diff --git a/.dir-locals.el b/.dir-locals.el
index d95eb4803f6..4b5393198de 100644
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -5,6 +5,7 @@
(c-file-style . "stroustrup")
(fill-column . 78)
(eval . (progn
+ (c-set-offset 'case-label '0)
(c-set-offset 'innamespace '0)
(c-set-offset 'inline-open '0)))
)
diff --git a/appveyor.yml b/appveyor.yml
index 68cc368a3a1..bf7ac752857 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -6,7 +6,7 @@
# - Select Git and fill in the Git clone URL
# - Setup a Git hook as explained in
# https://github.com/appveyor/webhooks#installing-git-hook
-# - Check 'Settings > General > Skip branches without appveyor'
+# - Check 'Settings > General > Skip branches without appveyor.yml'
# - Check 'Settings > General > Rolling builds'
# - Setup the global or project notifications to your liking
#
@@ -24,7 +24,14 @@ branches:
except:
- /^travis.*$/
-clone_depth: 5
+# Don't download the full Mesa history to speed up cloning. However the clone
+# depth must not be too small, otherwise builds might fail when lots of patches
+# are committed in succession, because the desired commit is not found on the
+# truncated history.
+#
+# See also:
+# - https://www.appveyor.com/blog/2014/06/04/shallow-clone-for-git-repositories
+clone_depth: 100
cache:
- win_flex_bison-2.4.5.zip
diff --git a/configure.ac b/configure.ac
index a18080d4ce5..e3d721d93aa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2161,7 +2161,12 @@ gallium_require_drm_loader() {
fi
}
+dnl This is for Glamor. Skip this if OpenGL is disabled.
require_egl_drm() {
+ if test "x$enable_opengl" = xno; then
+ return 0
+ fi
+
case "$with_egl_platforms" in
*drm*)
;;
diff --git a/docs/GL3.txt b/docs/GL3.txt
index f12e0ba8d29..257fc73225c 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -135,7 +135,7 @@ GL 4.2, GLSL 4.20:
GL_ARB_texture_compression_bptc DONE (i965, nvc0, r600, radeonsi)
GL_ARB_compressed_texture_pixel_storage DONE (all drivers)
- GL_ARB_shader_atomic_counters DONE (i965)
+ GL_ARB_shader_atomic_counters DONE (i965, nvc0)
GL_ARB_texture_storage DONE (all drivers)
GL_ARB_transform_feedback_instanced DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_base_instance DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
@@ -164,7 +164,7 @@ GL 4.3, GLSL 4.30:
GL_ARB_program_interface_query DONE (all drivers)
GL_ARB_robust_buffer_access_behavior not started
GL_ARB_shader_image_size DONE (i965)
- GL_ARB_shader_storage_buffer_object DONE (i965)
+ GL_ARB_shader_storage_buffer_object DONE (i965, nvc0)
GL_ARB_stencil_texturing DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_buffer_range DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
GL_ARB_texture_query_levels DONE (all drivers that support GLSL 1.30)
@@ -186,7 +186,7 @@ GL 4.4, GLSL 4.40:
- specified transform/feedback layout in progress
- input/output block locations DONE
GL_ARB_multi_bind DONE (all drivers)
- GL_ARB_query_buffer_object not started
+ GL_ARB_query_buffer_object DONE (nvc0)
GL_ARB_texture_mirror_clamp_to_edge DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_stencil8 DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_vertex_type_10f_11f_11f_rev DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
diff --git a/docs/envvars.html b/docs/envvars.html
index 5bb7b1e65bb..ba83335d0b0 100644
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -96,6 +96,7 @@ glGetString(GL_SHADING_LANGUAGE_VERSION). Valid values are integers, such as
"130". Mesa will not really implement all the features of the given language version
if it's higher than what's normally reported. (for developers only)
MESA_GLSL - shading language compiler options
+MESA_NO_MINMAX_CACHE - when set, the minmax index cache is globally disabled.
diff --git a/docs/relnotes/11.2.0.html b/docs/relnotes/11.2.0.html
index 616c134a768..0d92ed41ee8 100644
--- a/docs/relnotes/11.2.0.html
+++ b/docs/relnotes/11.2.0.html
@@ -48,7 +48,10 @@ Note: some of the new features are only available with certain drivers.
GL_ARB_compute_shader on i965
GL_ARB_copy_image on r600
GL_ARB_indirect_parameters on nvc0
+GL_ARB_query_buffer_object on nvc0
+GL_ARB_shader_atomic_counters on nvc0
GL_ARB_shader_draw_parameters on i965, nvc0
+GL_ARB_shader_storage_buffer_object on nvc0
GL_ARB_tessellation_shader on i965 and r600 (evergreen/cayman only)
GL_ARB_texture_buffer_object_rgb32 on freedreno/a4xx
GL_ARB_texture_buffer_range on freedreno/a4xx
@@ -58,6 +61,8 @@ Note: some of the new features are only available with certain drivers.
GL_ARB_vertex_type_10f_11f_11f_rev on freedreno/a4xx
GL_KHR_texture_compression_astc_ldr on freedreno/a4xx
GL_AMD_performance_monitor on radeonsi (CIK+ only)
+GL_ATI_meminfo on r600, radeonsi
+GL_NVX_gpu_memory_info on r600, radeonsi
New OSMesaCreateContextAttribs() function (for creating core profile
contexts)
diff --git a/include/D3D9/d3d9types.h b/include/D3D9/d3d9types.h
index 52fbc99dad7..d74ce80bb30 100644
--- a/include/D3D9/d3d9types.h
+++ b/include/D3D9/d3d9types.h
@@ -227,6 +227,7 @@ typedef struct _RGNDATA {
#define D3DERR_DRIVERINVALIDCALL MAKE_D3DHRESULT(2157)
#define D3DERR_DEVICEREMOVED MAKE_D3DHRESULT(2160)
#define D3DERR_DEVICEHUNG MAKE_D3DHRESULT(2164)
+#define S_PRESENT_OCCLUDED MAKE_D3DSTATUS(2168)
/********************************************************
* Bitmasks *
diff --git a/include/d3dadapter/present.h b/include/d3dadapter/present.h
index 08a97297201..162f703e320 100644
--- a/include/d3dadapter/present.h
+++ b/include/d3dadapter/present.h
@@ -69,6 +69,8 @@ typedef struct ID3DPresentVtbl
HRESULT (WINAPI *SetCursor)(ID3DPresent *This, void *pBitmap, POINT *pHotspot, BOOL bShow);
HRESULT (WINAPI *SetGammaRamp)(ID3DPresent *This, const D3DGAMMARAMP *pRamp, HWND hWndOverride);
HRESULT (WINAPI *GetWindowInfo)(ID3DPresent *This, HWND hWnd, int *width, int *height, int *depth);
+ /* Available since version 1.1 */
+ BOOL (WINAPI *GetWindowOccluded)(ID3DPresent *This);
} ID3DPresentVtbl;
struct ID3DPresent
@@ -96,6 +98,7 @@ struct ID3DPresent
#define ID3DPresent_SetCursor(p,a,b,c) (p)->lpVtbl->SetCursor(p,a,b,c)
#define ID3DPresent_SetGammaRamp(p,a,b) (p)->lpVtbl->SetGammaRamp(p,a,b)
#define ID3DPresent_GetWindowInfo(p,a,b,c,d) (p)->lpVtbl->GetWindowSize(p,a,b,c,d)
+#define ID3DPresent_GetWindowOccluded(p) (p)->lpVtbl->GetWindowOccluded(p)
typedef struct ID3DPresentGroupVtbl
{
diff --git a/src/compiler/.gitignore b/src/compiler/.gitignore
new file mode 100644
index 00000000000..6fb069f0bcb
--- /dev/null
+++ b/src/compiler/.gitignore
@@ -0,0 +1 @@
+glsl_compiler
diff --git a/src/compiler/Makefile.am b/src/compiler/Makefile.am
index e3d297fe299..fe96cb3c879 100644
--- a/src/compiler/Makefile.am
+++ b/src/compiler/Makefile.am
@@ -220,9 +220,11 @@ YACC_GEN = $(AM_V_YACC)$(YACC) $(YFLAGS)
LEX_GEN = $(AM_V_LEX)$(LEX) $(LFLAGS)
glsl/glsl_parser.cpp glsl/glsl_parser.h: glsl/glsl_parser.yy
+ $(MKDIR_GEN)
$(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl/glsl_parser.h $(srcdir)/glsl/glsl_parser.yy
glsl/glsl_lexer.cpp: glsl/glsl_lexer.ll
+ $(MKDIR_GEN)
$(LEX_GEN) -o $@ $(srcdir)/glsl/glsl_lexer.ll
glsl/glcpp/glcpp-parse.c glsl/glcpp/glcpp-parse.h: glsl/glcpp/glcpp-parse.y
diff --git a/src/compiler/glsl/.gitignore b/src/compiler/glsl/.gitignore
index e80f8af6bfc..6db4e738f6e 100644
--- a/src/compiler/glsl/.gitignore
+++ b/src/compiler/glsl/.gitignore
@@ -1,4 +1,3 @@
-glsl_compiler
glsl_lexer.cpp
glsl_parser.cpp
glsl_parser.h
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 98d8bc5f268..7213ad8ebec 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -291,6 +291,10 @@ apply_implicit_conversion(const glsl_type *to, ir_rvalue * &from,
if (!state->is_version(120, 0))
return false;
+ /* ESSL does not allow implicit conversions */
+ if (state->es_shader)
+ return false;
+
/* From page 27 (page 33 of the PDF) of the GLSL 1.50 spec:
*
* "There are no implicit array or structure conversions. For
diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index 95e86df1cdd..5512a33f114 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -661,7 +661,7 @@ private:
BA1(roundEven)
BA1(ceil)
BA1(fract)
- B2(mod)
+ BA2(mod)
BA1(modf)
BA2(min)
BA2(max)
@@ -1242,23 +1242,23 @@ builtin_builder::create_builtins()
FD(fract)
add_function("mod",
- _mod(glsl_type::float_type, glsl_type::float_type),
- _mod(glsl_type::vec2_type, glsl_type::float_type),
- _mod(glsl_type::vec3_type, glsl_type::float_type),
- _mod(glsl_type::vec4_type, glsl_type::float_type),
+ _mod(always_available, glsl_type::float_type, glsl_type::float_type),
+ _mod(always_available, glsl_type::vec2_type, glsl_type::float_type),
+ _mod(always_available, glsl_type::vec3_type, glsl_type::float_type),
+ _mod(always_available, glsl_type::vec4_type, glsl_type::float_type),
- _mod(glsl_type::vec2_type, glsl_type::vec2_type),
- _mod(glsl_type::vec3_type, glsl_type::vec3_type),
- _mod(glsl_type::vec4_type, glsl_type::vec4_type),
+ _mod(always_available, glsl_type::vec2_type, glsl_type::vec2_type),
+ _mod(always_available, glsl_type::vec3_type, glsl_type::vec3_type),
+ _mod(always_available, glsl_type::vec4_type, glsl_type::vec4_type),
- _mod(glsl_type::double_type, glsl_type::double_type),
- _mod(glsl_type::dvec2_type, glsl_type::double_type),
- _mod(glsl_type::dvec3_type, glsl_type::double_type),
- _mod(glsl_type::dvec4_type, glsl_type::double_type),
+ _mod(fp64, glsl_type::double_type, glsl_type::double_type),
+ _mod(fp64, glsl_type::dvec2_type, glsl_type::double_type),
+ _mod(fp64, glsl_type::dvec3_type, glsl_type::double_type),
+ _mod(fp64, glsl_type::dvec4_type, glsl_type::double_type),
- _mod(glsl_type::dvec2_type, glsl_type::dvec2_type),
- _mod(glsl_type::dvec3_type, glsl_type::dvec3_type),
- _mod(glsl_type::dvec4_type, glsl_type::dvec4_type),
+ _mod(fp64, glsl_type::dvec2_type, glsl_type::dvec2_type),
+ _mod(fp64, glsl_type::dvec3_type, glsl_type::dvec3_type),
+ _mod(fp64, glsl_type::dvec4_type, glsl_type::dvec4_type),
NULL);
FD(modf)
@@ -3452,9 +3452,10 @@ UNOPA(ceil, ir_unop_ceil)
UNOPA(fract, ir_unop_fract)
ir_function_signature *
-builtin_builder::_mod(const glsl_type *x_type, const glsl_type *y_type)
+builtin_builder::_mod(builtin_available_predicate avail,
+ const glsl_type *x_type, const glsl_type *y_type)
{
- return binop(always_available, ir_binop_mod, x_type, x_type, y_type);
+ return binop(avail, ir_binop_mod, x_type, x_type, y_type);
}
ir_function_signature *
diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index ccc04c00cea..6db74f1c634 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -328,6 +328,11 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
this->fields[this->num_fields].sample = 0;
this->fields[this->num_fields].patch = 0;
this->fields[this->num_fields].precision = GLSL_PRECISION_NONE;
+ this->fields[this->num_fields].image_read_only = 0;
+ this->fields[this->num_fields].image_write_only = 0;
+ this->fields[this->num_fields].image_coherent = 0;
+ this->fields[this->num_fields].image_volatile = 0;
+ this->fields[this->num_fields].image_restrict = 0;
this->num_fields++;
}
@@ -1201,7 +1206,12 @@ builtin_variable_generator::generate_varyings()
/* gl_Position and gl_PointSize are not visible from fragment shaders. */
if (state->stage != MESA_SHADER_FRAGMENT) {
add_varying(VARYING_SLOT_POS, vec4_t, "gl_Position");
- add_varying(VARYING_SLOT_PSIZ, float_t, "gl_PointSize");
+ if (!state->es_shader ||
+ state->stage == MESA_SHADER_VERTEX ||
+ (state->stage == MESA_SHADER_GEOMETRY &&
+ state->OES_geometry_point_size_enable)) {
+ add_varying(VARYING_SLOT_PSIZ, float_t, "gl_PointSize");
+ }
}
if (state->is_version(130, 0)) {
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index ef1a6575aaa..43a1aa94aff 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -2386,6 +2386,13 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
add_builtin_define(parser, "GL_OES_texture_storage_multisample_2d_array", 1);
if (extensions->ARB_blend_func_extended)
add_builtin_define(parser, "GL_EXT_blend_func_extended", 1);
+
+ if (version >= 310) {
+ if (extensions->OES_geometry_shader) {
+ add_builtin_define(parser, "GL_OES_geometry_point_size", 1);
+ add_builtin_define(parser, "GL_OES_geometry_shader", 1);
+ }
+ }
}
} else {
add_builtin_define(parser, "GL_ARB_draw_buffers", 1);
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index ecf0d7f76e5..d7a4b254aa2 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -600,6 +600,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
/* OES extensions go here, sorted alphabetically.
*/
EXT(OES_EGL_image_external, false, true, OES_EGL_image_external),
+ EXT(OES_geometry_point_size, false, true, OES_geometry_shader),
EXT(OES_geometry_shader, false, true, OES_geometry_shader),
EXT(OES_standard_derivatives, false, true, OES_standard_derivatives),
EXT(OES_texture_3D, false, true, dummy_true),
@@ -1867,59 +1868,76 @@ do_common_optimization(exec_list *ir, bool linked,
const struct gl_shader_compiler_options *options,
bool native_integers)
{
+ const bool debug = false;
GLboolean progress = GL_FALSE;
- progress = lower_instructions(ir, SUB_TO_ADD_NEG) || progress;
+#define OPT(PASS, ...) do { \
+ if (debug) { \
+ fprintf(stderr, "START GLSL optimization %s\n", #PASS); \
+ const bool opt_progress = PASS(__VA_ARGS__); \
+ progress = opt_progress || progress; \
+ if (opt_progress) \
+ _mesa_print_ir(stderr, ir, NULL); \
+ fprintf(stderr, "GLSL optimization %s: %s progress\n", \
+ #PASS, opt_progress ? "made" : "no"); \
+ } else { \
+ progress = PASS(__VA_ARGS__) || progress; \
+ } \
+ } while (false)
+
+ OPT(lower_instructions, ir, SUB_TO_ADD_NEG);
if (linked) {
- progress = do_function_inlining(ir) || progress;
- progress = do_dead_functions(ir) || progress;
- progress = do_structure_splitting(ir) || progress;
+ OPT(do_function_inlining, ir);
+ OPT(do_dead_functions, ir);
+ OPT(do_structure_splitting, ir);
}
- progress = do_if_simplification(ir) || progress;
- progress = opt_flatten_nested_if_blocks(ir) || progress;
- progress = opt_conditional_discard(ir) || progress;
- progress = do_copy_propagation(ir) || progress;
- progress = do_copy_propagation_elements(ir) || progress;
+ OPT(do_if_simplification, ir);
+ OPT(opt_flatten_nested_if_blocks, ir);
+ OPT(opt_conditional_discard, ir);
+ OPT(do_copy_propagation, ir);
+ OPT(do_copy_propagation_elements, ir);
if (options->OptimizeForAOS && !linked)
- progress = opt_flip_matrices(ir) || progress;
+ OPT(opt_flip_matrices, ir);
if (linked && options->OptimizeForAOS) {
- progress = do_vectorize(ir) || progress;
+ OPT(do_vectorize, ir);
}
if (linked)
- progress = do_dead_code(ir, uniform_locations_assigned) || progress;
+ OPT(do_dead_code, ir, uniform_locations_assigned);
else
- progress = do_dead_code_unlinked(ir) || progress;
- progress = do_dead_code_local(ir) || progress;
- progress = do_tree_grafting(ir) || progress;
- progress = do_constant_propagation(ir) || progress;
+ OPT(do_dead_code_unlinked, ir);
+ OPT(do_dead_code_local, ir);
+ OPT(do_tree_grafting, ir);
+ OPT(do_constant_propagation, ir);
if (linked)
- progress = do_constant_variable(ir) || progress;
+ OPT(do_constant_variable, ir);
else
- progress = do_constant_variable_unlinked(ir) || progress;
- progress = do_constant_folding(ir) || progress;
- progress = do_minmax_prune(ir) || progress;
- progress = do_rebalance_tree(ir) || progress;
- progress = do_algebraic(ir, native_integers, options) || progress;
- progress = do_lower_jumps(ir) || progress;
- progress = do_vec_index_to_swizzle(ir) || progress;
- progress = lower_vector_insert(ir, false) || progress;
- progress = do_swizzle_swizzle(ir) || progress;
- progress = do_noop_swizzle(ir) || progress;
+ OPT(do_constant_variable_unlinked, ir);
+ OPT(do_constant_folding, ir);
+ OPT(do_minmax_prune, ir);
+ OPT(do_rebalance_tree, ir);
+ OPT(do_algebraic, ir, native_integers, options);
+ OPT(do_lower_jumps, ir);
+ OPT(do_vec_index_to_swizzle, ir);
+ OPT(lower_vector_insert, ir, false);
+ OPT(do_swizzle_swizzle, ir);
+ OPT(do_noop_swizzle, ir);
- progress = optimize_split_arrays(ir, linked) || progress;
- progress = optimize_redundant_jumps(ir) || progress;
+ OPT(optimize_split_arrays, ir, linked);
+ OPT(optimize_redundant_jumps, ir);
loop_state *ls = analyze_loop_variables(ir);
if (ls->loop_found) {
- progress = set_loop_controls(ir, ls) || progress;
- progress = unroll_loops(ir, ls, options) || progress;
+ OPT(set_loop_controls, ir, ls);
+ OPT(unroll_loops, ir, ls, options);
}
delete ls;
+#undef OPT
+
return progress;
}
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 3f88e01d599..a905b564787 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -591,6 +591,8 @@ struct _mesa_glsl_parse_state {
*/
bool OES_EGL_image_external_enable;
bool OES_EGL_image_external_warn;
+ bool OES_geometry_point_size_enable;
+ bool OES_geometry_point_size_warn;
bool OES_geometry_shader_enable;
bool OES_geometry_shader_warn;
bool OES_standard_derivatives_enable;
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 33b2d4c8646..7072c16cb28 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -471,10 +471,11 @@ private:
*/
class parcel_out_uniform_storage : public program_resource_visitor {
public:
- parcel_out_uniform_storage(struct string_to_uint_map *map,
+ parcel_out_uniform_storage(struct gl_shader_program *prog,
+ struct string_to_uint_map *map,
struct gl_uniform_storage *uniforms,
union gl_constant_value *values)
- : map(map), uniforms(uniforms), values(values)
+ : prog(prog), map(map), uniforms(uniforms), values(values)
{
}
@@ -492,8 +493,7 @@ public:
memset(this->targets, 0, sizeof(this->targets));
}
- void set_and_process(struct gl_shader_program *prog,
- ir_variable *var)
+ void set_and_process(ir_variable *var)
{
current_var = var;
field_counter = 0;
@@ -643,6 +643,16 @@ private:
uniform->opaque[shader_type].index = this->next_image;
uniform->opaque[shader_type].active = true;
+ /* Set image access qualifiers */
+ const GLenum access =
+ (current_var->data.image_read_only ? GL_READ_ONLY :
+ current_var->data.image_write_only ? GL_WRITE_ONLY :
+ GL_READ_WRITE);
+
+ for (unsigned j = 0; j < MAX2(1, uniform->array_elements); ++j)
+ prog->_LinkedShaders[shader_type]->
+ ImageAccess[this->next_image + j] = access;
+
/* Increment the image index by 1 for non-arrays and by the
* number of array elements for arrays.
*/
@@ -844,6 +854,11 @@ private:
this->values += values_for_type(type);
}
+ /**
+ * Current program being processed.
+ */
+ struct gl_shader_program *prog;
+
struct string_to_uint_map *map;
struct gl_uniform_storage *uniforms;
@@ -1007,40 +1022,6 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
}
}
-static void
-link_set_image_access_qualifiers(struct gl_shader_program *prog,
- gl_shader *sh, unsigned shader_stage,
- ir_variable *var, const glsl_type *type,
- char **name, size_t name_length)
-{
- /* Handle arrays of arrays */
- if (type->is_array() && type->fields.array->is_array()) {
- for (unsigned i = 0; i < type->length; i++) {
- size_t new_length = name_length;
-
- /* Append the subscript to the current variable name */
- ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i);
-
- link_set_image_access_qualifiers(prog, sh, shader_stage, var,
- type->fields.array, name,
- new_length);
- }
- } else {
- unsigned id = 0;
- bool found = prog->UniformHash->get(id, *name);
- assert(found);
- (void) found;
- const gl_uniform_storage *storage = &prog->UniformStorage[id];
- const unsigned index = storage->opaque[shader_stage].index;
- const GLenum access = (var->data.image_read_only ? GL_READ_ONLY :
- var->data.image_write_only ? GL_WRITE_ONLY :
- GL_READ_WRITE);
-
- for (unsigned j = 0; j < MAX2(1, storage->array_elements); ++j)
- sh->ImageAccess[index + j] = access;
- }
-}
-
/**
* Combine the hidden uniform hash map with the uniform hash map so that the
* hidden uniforms will be given indicies at the end of the uniform storage
@@ -1148,7 +1129,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
union gl_constant_value *data_end = &data[num_data_slots];
#endif
- parcel_out_uniform_storage parcel(prog->UniformHash, uniforms, data);
+ parcel_out_uniform_storage parcel(prog, prog->UniformHash, uniforms, data);
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (prog->_LinkedShaders[i] == NULL)
@@ -1163,7 +1144,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
var->data.mode != ir_var_shader_storage))
continue;
- parcel.set_and_process(prog, var);
+ parcel.set_and_process(var);
}
prog->_LinkedShaders[i]->active_samplers = parcel.shader_samplers_used;
@@ -1301,29 +1282,6 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
prog->NumHiddenUniforms = hidden_uniforms;
prog->UniformStorage = uniforms;
- /**
- * Scan the program for image uniforms and store image unit access
- * information into the gl_shader data structure.
- */
- for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
- gl_shader *sh = prog->_LinkedShaders[i];
-
- if (sh == NULL)
- continue;
-
- foreach_in_list(ir_instruction, node, sh->ir) {
- ir_variable *var = node->as_variable();
-
- if (var && var->data.mode == ir_var_uniform &&
- var->type->contains_image()) {
- char *name_copy = ralloc_strdup(NULL, var->name);
- link_set_image_access_qualifiers(prog, sh, i, var, var->type,
- &name_copy, strlen(var->name));
- ralloc_free(name_copy);
- }
- }
- }
-
link_set_uniform_initializers(prog, boolean_true);
return;
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 264b69ca619..a4c730ffdcf 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -967,11 +967,16 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
return;
}
- if ((consumer_var == NULL && producer_var->type->contains_integer()) ||
+ bool needs_flat_qualifier = consumer_var == NULL &&
+ (producer_var->type->contains_integer() ||
+ producer_var->type->contains_double());
+
+ if (needs_flat_qualifier ||
(consumer_stage != -1 && consumer_stage != MESA_SHADER_FRAGMENT)) {
/* Since this varying is not being consumed by the fragment shader, its
* interpolation type varying cannot possibly affect rendering.
- * Also, this variable is non-flat and is (or contains) an integer.
+ * Also, this variable is non-flat and is (or contains) an integer
+ * or a double.
* If the consumer stage is unknown, don't modify the interpolation
* type as it could affect rendering later with separate shaders.
*
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 6657777d74c..4776ffa6acd 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -4633,8 +4633,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
&prog->NumShaderStorageBlocks,
&prog->SsboInterfaceBlockIndex);
- /* FINISHME: Assign fragment shader output locations. */
-
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (prog->_LinkedShaders[i] == NULL)
continue;
diff --git a/src/compiler/glsl/lower_buffer_access.cpp b/src/compiler/glsl/lower_buffer_access.cpp
index f8c8d140ea8..9ad811de9f1 100644
--- a/src/compiler/glsl/lower_buffer_access.cpp
+++ b/src/compiler/glsl/lower_buffer_access.cpp
@@ -327,6 +327,7 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx,
unsigned *const_offset,
bool *row_major,
int *matrix_columns,
+ const glsl_struct_field **struct_field,
unsigned packing)
{
*offset = new(mem_ctx) ir_constant(0u);
@@ -442,8 +443,11 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx,
intra_struct_offset = glsl_align(intra_struct_offset, field_align);
if (strcmp(struct_type->fields.structure[i].name,
- deref_record->field) == 0)
+ deref_record->field) == 0) {
+ if (struct_field)
+ *struct_field = &struct_type->fields.structure[i];
break;
+ }
if (packing == GLSL_INTERFACE_PACKING_STD430)
intra_struct_offset += type->std430_size(field_row_major);
diff --git a/src/compiler/glsl/lower_buffer_access.h b/src/compiler/glsl/lower_buffer_access.h
index cc4614e9792..8772bdb76ff 100644
--- a/src/compiler/glsl/lower_buffer_access.h
+++ b/src/compiler/glsl/lower_buffer_access.h
@@ -57,6 +57,7 @@ public:
void setup_buffer_access(void *mem_ctx, ir_variable *var, ir_rvalue *deref,
ir_rvalue **offset, unsigned *const_offset,
bool *row_major, int *matrix_columns,
+ const glsl_struct_field **struct_field,
unsigned packing);
};
diff --git a/src/compiler/glsl/lower_shared_reference.cpp b/src/compiler/glsl/lower_shared_reference.cpp
index 533cd9202f4..12499695882 100644
--- a/src/compiler/glsl/lower_shared_reference.cpp
+++ b/src/compiler/glsl/lower_shared_reference.cpp
@@ -142,7 +142,7 @@ lower_shared_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
setup_buffer_access(mem_ctx, var, deref,
&offset, &const_offset,
- &row_major, &matrix_columns, packing);
+ &row_major, &matrix_columns, NULL, packing);
/* Now that we've calculated the offset to the start of the
* dereference, walk over the type and emit loads into a temporary.
@@ -210,7 +210,7 @@ lower_shared_reference_visitor::handle_assignment(ir_assignment *ir)
setup_buffer_access(mem_ctx, var, deref,
&offset, &const_offset,
- &row_major, &matrix_columns, packing);
+ &row_major, &matrix_columns, NULL, packing);
deref = new(mem_ctx) ir_dereference_variable(store_var);
@@ -370,7 +370,7 @@ lower_shared_reference_visitor::lower_shared_atomic_intrinsic(ir_call *ir)
setup_buffer_access(mem_ctx, var, deref,
&offset, &const_offset,
- &row_major, &matrix_columns, packing);
+ &row_major, &matrix_columns, NULL, packing);
assert(offset);
assert(!row_major);
diff --git a/src/compiler/glsl/lower_ubo_reference.cpp b/src/compiler/glsl/lower_ubo_reference.cpp
index a172054bac8..d6269f7cbac 100644
--- a/src/compiler/glsl/lower_ubo_reference.cpp
+++ b/src/compiler/glsl/lower_ubo_reference.cpp
@@ -45,7 +45,7 @@ class lower_ubo_reference_visitor :
public lower_buffer_access::lower_buffer_access {
public:
lower_ubo_reference_visitor(struct gl_shader *shader)
- : shader(shader)
+ : shader(shader), struct_field(NULL), variable(NULL)
{
}
@@ -60,6 +60,7 @@ public:
bool *row_major,
int *matrix_columns,
unsigned packing);
+ uint32_t ssbo_access_params();
ir_expression *ubo_load(void *mem_ctx, const struct glsl_type *type,
ir_rvalue *offset);
ir_call *ssbo_load(void *mem_ctx, const struct glsl_type *type,
@@ -104,6 +105,8 @@ public:
struct gl_shader *shader;
struct gl_uniform_buffer_variable *ubo_var;
+ const struct glsl_struct_field *struct_field;
+ ir_variable *variable;
ir_rvalue *uniform_block;
bool progress;
};
@@ -288,8 +291,9 @@ lower_ubo_reference_visitor::setup_for_load_or_store(void *mem_ctx,
*const_offset = ubo_var->Offset;
+ this->struct_field = NULL;
setup_buffer_access(mem_ctx, var, deref, offset, const_offset, row_major,
- matrix_columns, packing);
+ matrix_columns, &this->struct_field, packing);
}
void
@@ -317,6 +321,7 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
this->buffer_access_type =
var->is_in_shader_storage_block() ?
ssbo_load_access : ubo_load_access;
+ this->variable = var;
/* Compute the offset to the start if the dereference as well as other
* information we need to configure the write
@@ -370,6 +375,24 @@ shader_storage_buffer_object(const _mesa_glsl_parse_state *state)
return state->ARB_shader_storage_buffer_object_enable;
}
+uint32_t
+lower_ubo_reference_visitor::ssbo_access_params()
+{
+ assert(variable);
+
+ if (variable->is_interface_instance()) {
+ assert(struct_field);
+
+ return ((struct_field->image_coherent ? ACCESS_COHERENT : 0) |
+ (struct_field->image_restrict ? ACCESS_RESTRICT : 0) |
+ (struct_field->image_volatile ? ACCESS_VOLATILE : 0));
+ } else {
+ return ((variable->data.image_coherent ? ACCESS_COHERENT : 0) |
+ (variable->data.image_restrict ? ACCESS_RESTRICT : 0) |
+ (variable->data.image_volatile ? ACCESS_VOLATILE : 0));
+ }
+}
+
ir_call *
lower_ubo_reference_visitor::ssbo_store(void *mem_ctx,
ir_rvalue *deref,
@@ -394,6 +417,10 @@ lower_ubo_reference_visitor::ssbo_store(void *mem_ctx,
ir_variable(glsl_type::uint_type, "write_mask" , ir_var_function_in);
sig_params.push_tail(writemask_ref);
+ ir_variable *access_ref = new(mem_ctx)
+ ir_variable(glsl_type::uint_type, "access" , ir_var_function_in);
+ sig_params.push_tail(access_ref);
+
ir_function_signature *sig = new(mem_ctx)
ir_function_signature(glsl_type::void_type, shader_storage_buffer_object);
assert(sig);
@@ -408,6 +435,7 @@ lower_ubo_reference_visitor::ssbo_store(void *mem_ctx,
call_params.push_tail(offset->clone(mem_ctx, NULL));
call_params.push_tail(deref->clone(mem_ctx, NULL));
call_params.push_tail(new(mem_ctx) ir_constant(write_mask));
+ call_params.push_tail(new(mem_ctx) ir_constant(ssbo_access_params()));
return new(mem_ctx) ir_call(sig, NULL, &call_params);
}
@@ -426,6 +454,10 @@ lower_ubo_reference_visitor::ssbo_load(void *mem_ctx,
ir_variable(glsl_type::uint_type, "offset_ref" , ir_var_function_in);
sig_params.push_tail(offset_ref);
+ ir_variable *access_ref = new(mem_ctx)
+ ir_variable(glsl_type::uint_type, "access" , ir_var_function_in);
+ sig_params.push_tail(access_ref);
+
ir_function_signature *sig =
new(mem_ctx) ir_function_signature(type, shader_storage_buffer_object);
assert(sig);
@@ -444,6 +476,7 @@ lower_ubo_reference_visitor::ssbo_load(void *mem_ctx,
exec_list call_params;
call_params.push_tail(this->uniform_block->clone(mem_ctx, NULL));
call_params.push_tail(offset->clone(mem_ctx, NULL));
+ call_params.push_tail(new(mem_ctx) ir_constant(ssbo_access_params()));
return new(mem_ctx) ir_call(sig, deref_result, &call_params);
}
@@ -499,6 +532,7 @@ lower_ubo_reference_visitor::write_to_memory(void *mem_ctx,
unsigned packing = var->get_interface_type()->interface_packing;
this->buffer_access_type = ssbo_store_access;
+ this->variable = var;
/* Compute the offset to the start if the dereference as well as other
* information we need to configure the write
@@ -678,6 +712,7 @@ lower_ubo_reference_visitor::process_ssbo_unsized_array_length(ir_rvalue **rvalu
int unsized_array_stride = calculate_unsized_array_stride(deref, packing);
this->buffer_access_type = ssbo_unsized_array_length_access;
+ this->variable = var;
/* Compute the offset to the start if the dereference as well as other
* information we need to calculate the length.
@@ -910,6 +945,7 @@ lower_ubo_reference_visitor::lower_ssbo_atomic_intrinsic(ir_call *ir)
unsigned packing = var->get_interface_type()->interface_packing;
this->buffer_access_type = ssbo_atomic_access;
+ this->variable = var;
setup_for_load_or_store(mem_ctx, var, deref,
&offset, &const_offset,
diff --git a/src/compiler/glsl/opt_tree_grafting.cpp b/src/compiler/glsl/opt_tree_grafting.cpp
index 83effb7424c..812f996fb81 100644
--- a/src/compiler/glsl/opt_tree_grafting.cpp
+++ b/src/compiler/glsl/opt_tree_grafting.cpp
@@ -361,11 +361,12 @@ tree_grafting_basic_block(ir_instruction *bb_first,
if (!lhs_var)
continue;
- if (lhs_var->data.mode == ir_var_function_out ||
- lhs_var->data.mode == ir_var_function_inout ||
- lhs_var->data.mode == ir_var_shader_out ||
- lhs_var->data.mode == ir_var_shader_storage)
- continue;
+ if (lhs_var->data.mode == ir_var_function_out ||
+ lhs_var->data.mode == ir_var_function_inout ||
+ lhs_var->data.mode == ir_var_shader_out ||
+ lhs_var->data.mode == ir_var_shader_storage ||
+ lhs_var->data.mode == ir_var_shader_shared)
+ continue;
ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var);
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 5920c2e2611..d2eaec173b3 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -164,6 +164,11 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
this->fields.structure[i].sample = fields[i].sample;
this->fields.structure[i].matrix_layout = fields[i].matrix_layout;
this->fields.structure[i].patch = fields[i].patch;
+ this->fields.structure[i].image_read_only = fields[i].image_read_only;
+ this->fields.structure[i].image_write_only = fields[i].image_write_only;
+ this->fields.structure[i].image_coherent = fields[i].image_coherent;
+ this->fields.structure[i].image_volatile = fields[i].image_volatile;
+ this->fields.structure[i].image_restrict = fields[i].image_restrict;
this->fields.structure[i].precision = fields[i].precision;
}
@@ -1330,6 +1335,13 @@ glsl_type::can_implicitly_convert_to(const glsl_type *desired,
if (this == desired)
return true;
+ /* ESSL does not allow implicit conversions. If there is no state, we're
+ * doing intra-stage function linking where these checks have already been
+ * done.
+ */
+ if (state && state->es_shader)
+ return false;
+
/* There is no conversion among matrix types. */
if (this->matrix_columns > 1 || desired->matrix_columns > 1)
return false;
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index a9b5281e774..5965cb2eedb 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -885,7 +885,8 @@ struct glsl_struct_field {
glsl_struct_field(const struct glsl_type *_type, const char *_name)
: type(_type), name(_name), location(-1), interpolation(0), centroid(0),
sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
- precision(GLSL_PRECISION_NONE)
+ precision(GLSL_PRECISION_NONE), image_read_only(0), image_write_only(0),
+ image_coherent(0), image_volatile(0), image_restrict(0)
{
/* empty */
}
diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c
index 37cb0221e0b..312d2f99a1c 100644
--- a/src/compiler/nir/nir_lower_alu_to_scalar.c
+++ b/src/compiler/nir/nir_lower_alu_to_scalar.c
@@ -139,7 +139,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
b->shader->options->lower_pack_unorm_2x16);
nir_ssa_def *word =
- nir_extract_uword(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
+ nir_extract_u16(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_ssa_def *val =
nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)),
nir_channel(b, word, 0));
@@ -154,7 +154,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
b->shader->options->lower_pack_unorm_4x8);
nir_ssa_def *byte =
- nir_extract_ubyte(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
+ nir_extract_u8(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_ssa_def *val =
nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)),
nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))),
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 0eff89783dd..60ade4a80ae 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -238,15 +238,15 @@ unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
-unop_horiz("pack_uvec2_to_uint", 0, tuint, 2, tuint, """
-dst = (src0.x & 0xffff) | (src0.y >> 16);
+unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """
+dst.x = (src0.x & 0xffff) | (src0.y >> 16);
""")
-unop_horiz("pack_uvec4_to_uint", 0, tuint, 4, tuint, """
-dst = (src0.x << 0) |
- (src0.y << 8) |
- (src0.z << 16) |
- (src0.w << 24);
+unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """
+dst.x = (src0.x << 0) |
+ (src0.y << 8) |
+ (src0.z << 16) |
+ (src0.w << 24);
""")
# Lowered floating point unpacking operations.
@@ -562,12 +562,12 @@ dst.y = src1.x;
""")
# Byte extraction
-binop("extract_ubyte", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
-binop("extract_ibyte", tint, "", "(int8_t)(src0 >> (src1 * 8))")
+binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
+binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
# Word extraction
-binop("extract_uword", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
-binop("extract_iword", tint, "", "(int16_t)(src0 >> (src1 * 16))")
+binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
+binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, const_expr):
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index f4bfd3a921a..d4f4a3d903c 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -248,19 +248,19 @@ optimizations = [
('ubfe', 'value', 'offset', 'bits')),
'options->lower_bitfield_extract'),
- (('extract_ibyte', a, b),
- ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 8),
+ (('extract_i8', a, b),
+ ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
'options->lower_extract_byte'),
- (('extract_ubyte', a, b),
+ (('extract_u8', a, b),
('iand', ('ushr', a, ('imul', b, 8)), 0xff),
'options->lower_extract_byte'),
- (('extract_iword', a, b),
+ (('extract_i16', a, b),
('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
'options->lower_extract_word'),
- (('extract_uword', a, b),
+ (('extract_u16', a, b),
('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
'options->lower_extract_word'),
@@ -285,30 +285,30 @@ optimizations = [
'options->lower_pack_snorm_4x8'),
(('unpack_unorm_2x16', 'v'),
- ('fdiv', ('u2f', ('vec4', ('extract_uword', 'v', 0),
- ('extract_uword', 'v', 1), 0, 0)),
+ ('fdiv', ('u2f', ('vec2', ('extract_u16', 'v', 0),
+ ('extract_u16', 'v', 1))),
65535.0),
'options->lower_unpack_unorm_2x16'),
(('unpack_unorm_4x8', 'v'),
- ('fdiv', ('u2f', ('vec4', ('extract_ubyte', 'v', 0),
- ('extract_ubyte', 'v', 1),
- ('extract_ubyte', 'v', 2),
- ('extract_ubyte', 'v', 3))),
+ ('fdiv', ('u2f', ('vec4', ('extract_u8', 'v', 0),
+ ('extract_u8', 'v', 1),
+ ('extract_u8', 'v', 2),
+ ('extract_u8', 'v', 3))),
255.0),
'options->lower_unpack_unorm_4x8'),
(('unpack_snorm_2x16', 'v'),
- ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_iword', 'v', 0),
- ('extract_iword', 'v', 1), 0, 0)),
+ ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
+ ('extract_i16', 'v', 1))),
32767.0))),
'options->lower_unpack_snorm_2x16'),
(('unpack_snorm_4x8', 'v'),
- ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_ibyte', 'v', 0),
- ('extract_ibyte', 'v', 1),
- ('extract_ibyte', 'v', 2),
- ('extract_ibyte', 'v', 3))),
+ ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
+ ('extract_i8', 'v', 1),
+ ('extract_i8', 'v', 2),
+ ('extract_i8', 'v', 3))),
127.0))),
'options->lower_unpack_snorm_4x8'),
]
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index e3f46e3d739..d44aabf8f3c 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -544,6 +544,16 @@ enum gl_frag_depth_layout
FRAG_DEPTH_LAYOUT_UNCHANGED
};
+/**
+ * \brief Buffer access qualifiers
+ */
+enum gl_buffer_access_qualifier
+{
+ ACCESS_COHERENT = 1,
+ ACCESS_RESTRICT = 2,
+ ACCESS_VOLATILE = 4,
+};
+
#ifdef __cplusplus
} /* extern "C" */
#endif
diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk
index 749be7dfeb9..2b469b65ee4 100644
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -85,7 +85,7 @@ endif
# virgl
ifneq ($(filter virgl, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += winsys/virgl/drm drivers/virgl
+SUBDIRS += winsys/virgl/drm winsys/virgl/vtest drivers/virgl
endif
# vmwgfx
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 7854142f736..7cf0deece81 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
*
* Convert float32 to half floats, preserving Infs and NaNs,
* with rounding towards zero (trunc).
+ * XXX: For GL, would prefer rounding towards nearest(-even).
*/
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
@@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
LLVMValueRef result;
+ /*
+ * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
+ * directly, without any (x86 or generic) intrinsics.
+ * Albeit the rounding mode cannot be specified (and is undefined,
+ * though in practice on x86 seems to do nearest-even but it may
+ * be dependent on instruction set support), so is essentially
+ * useless.
+ */
+
if (util_cpu_caps.has_f16c &&
(length == 4 || length == 8)) {
struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
@@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
LLVMValueRef index = LLVMConstInt(i32t, i, 0);
LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
- /* XXX: not really supported by backends */
+ /*
+ * XXX: not really supported by backends.
+ * Even if they would now, rounding mode cannot be specified and
+ * is undefined.
+ */
LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 0b0f7f0147c..d80c997ad84 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -256,6 +256,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
}
+/**
+ * Un-interleave vector.
+ * This will return a vector consisting of every second element
+ * (depending on lo_hi, beginning at 0 or 1).
+ * The returned vector size (elems and width) will only be half
+ * that of the source vector.
+ */
+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+ unsigned num_elems,
+ LLVMValueRef a,
+ unsigned lo_hi)
+{
+ LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+ assert(num_elems <= LP_MAX_VECTOR_LENGTH);
+
+ for (i = 0; i < num_elems / 2; ++i)
+ elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
+
+ shuffle = LLVMConstVector(elems, num_elems / 2);
+
+ return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
+}
+
+
/**
* Interleave vector elements.
*
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index 7cede35bbde..367fba1fd21 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
LLVMValueRef b,
unsigned lo_hi);
+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+ unsigned num_elems,
+ LLVMValueRef a,
+ unsigned lo_hi);
void
lp_build_unpack2(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index c88dfbf974a..1cbe47ca91f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
/* Ignore deprecated instructions */
switch (inst->Instruction.Opcode) {
- case TGSI_OPCODE_UP2H:
case TGSI_OPCODE_UP2US:
case TGSI_OPCODE_UP4B:
case TGSI_OPCODE_UP4UB:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 6f75bec5005..43af6b4ea0d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -45,8 +45,10 @@
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
#include "lp_bld_gather.h"
#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
#include "tgsi/tgsi_exec.h"
@@ -530,6 +532,77 @@ static struct lp_build_tgsi_action log_action = {
log_emit /* emit */
};
+/* TGSI_OPCODE_PK2H */
+
+static void
+pk2h_fetch_args(
+ struct lp_build_tgsi_context * bld_base,
+ struct lp_build_emit_data * emit_data)
+{
+ /* src0.x */
+ emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+ 0, TGSI_CHAN_X);
+ /* src0.y */
+ emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+ 0, TGSI_CHAN_Y);
+}
+
+static void
+pk2h_emit(
+ const struct lp_build_tgsi_action *action,
+ struct lp_build_tgsi_context *bld_base,
+ struct lp_build_emit_data *emit_data)
+{
+ struct gallivm_state *gallivm = bld_base->base.gallivm;
+ struct lp_type f16i_t;
+ LLVMValueRef lo, hi, res;
+
+ f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
+ lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
+ hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
+ /* maybe some interleave doubling vector width would be useful... */
+ lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length * 2);
+ hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length * 2);
+ res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
+
+ emit_data->output[emit_data->chan] = res;
+}
+
+static struct lp_build_tgsi_action pk2h_action = {
+ pk2h_fetch_args, /* fetch_args */
+ pk2h_emit /* emit */
+};
+
+/* TGSI_OPCODE_UP2H */
+
+static void
+up2h_emit(
+ const struct lp_build_tgsi_action *action,
+ struct lp_build_tgsi_context *bld_base,
+ struct lp_build_emit_data *emit_data)
+{
+ struct gallivm_state *gallivm = bld_base->base.gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMContextRef context = gallivm->context;
+ LLVMValueRef lo, hi, res[2], arg;
+ unsigned nr = bld_base->base.type.length;
+ LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context), nr * 2);
+
+ arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
+ lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
+ hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
+ res[0] = lp_build_half_to_float(gallivm, lo);
+ res[1] = lp_build_half_to_float(gallivm, hi);
+
+ emit_data->output[0] = emit_data->output[2] = res[0];
+ emit_data->output[1] = emit_data->output[3] = res[1];
+}
+
+static struct lp_build_tgsi_action up2h_action = {
+ scalar_unary_fetch_args, /* fetch_args */
+ up2h_emit /* emit */
+};
+
/* TGSI_OPCODE_LRP */
static void
@@ -1032,10 +1105,12 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
+ bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
+ bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;
bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args = scalar_unary_fetch_args;
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
index 332b1cba984..90820d3fe91 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -226,14 +226,9 @@ pipe_freedreno_create_screen(int fd)
struct pipe_screen *
pipe_virgl_create_screen(int fd)
{
- struct virgl_winsys *vws;
struct pipe_screen *screen;
- vws = virgl_drm_winsys_create(fd);
- if (!vws)
- return NULL;
-
- screen = virgl_create_screen(vws);
+ screen = virgl_drm_screen_create(fd);
return screen ? debug_screen_wrap(screen) : NULL;
}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index f67c16200a9..d898fd66f48 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -58,6 +58,7 @@
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
+#include "util/u_half.h"
#include "util/u_memory.h"
#include "util/u_math.h"
@@ -3057,6 +3058,45 @@ exec_dp2(struct tgsi_exec_machine *mach,
}
}
+static void
+exec_pk2h(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_instruction *inst)
+{
+ unsigned chan;
+ union tgsi_exec_channel arg[2], dst;
+
+ fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
+ fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+ for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
+ dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
+ (util_float_to_half(arg[1].f[chan]) << 16);
+ }
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+ if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+ store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
+ }
+ }
+}
+
+static void
+exec_up2h(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_instruction *inst)
+{
+ unsigned chan;
+ union tgsi_exec_channel arg, dst[2];
+
+ fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
+ for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
+ dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
+ dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
+ }
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+ if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+ store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+ }
+ }
+}
+
static void
exec_scs(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst)
@@ -4339,7 +4379,7 @@ exec_instruction(
break;
case TGSI_OPCODE_PK2H:
- assert (0);
+ exec_pk2h(mach, inst);
break;
case TGSI_OPCODE_PK2US:
@@ -4425,7 +4465,7 @@ exec_instruction(
break;
case TGSI_OPCODE_UP2H:
- assert (0);
+ exec_up2h(mach, inst);
break;
case TGSI_OPCODE_UP2US:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index b270dd73b67..70fc4604537 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -149,7 +149,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
{ 1, 2, 0, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
{ 1, 2, 0, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
{ 1, 2, 0, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
- { 0, 1, 0, 0, 0, 0, 1, NONE, "", 112 }, /* removed */
+ { 0, 1, 0, 0, 0, 0, 0, OTHR, "MEMBAR", TGSI_OPCODE_MEMBAR },
{ 0, 1, 0, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
{ 0, 1, 0, 0, 0, 0, 0, NONE, "", 114 }, /* removed */
{ 0, 1, 0, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
@@ -426,6 +426,7 @@ tgsi_opcode_infer_src_type( uint opcode )
case TGSI_OPCODE_SAMPLE_I:
case TGSI_OPCODE_SAMPLE_I_MS:
case TGSI_OPCODE_UMUL_HI:
+ case TGSI_OPCODE_UP2H:
return TGSI_TYPE_UNSIGNED;
case TGSI_OPCODE_IMUL_HI:
case TGSI_OPCODE_I2F:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 7a02e27e01e..687fb54830d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -377,6 +377,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
info->reads_position = TRUE;
else if (semName == TGSI_SEMANTIC_FACE)
info->uses_frontface = TRUE;
+ else if (semName == TGSI_SEMANTIC_SAMPLEMASK)
+ info->reads_samplemask = TRUE;
}
else if (file == TGSI_FILE_OUTPUT) {
info->output_semantic_name[reg] = (ubyte) semName;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index b0b423ab528..0541255764c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -81,6 +81,7 @@ struct tgsi_shader_info
ubyte colors_written;
boolean reads_position; /**< does fragment shader read position? */
boolean reads_z; /**< does fragment shader read depth? */
+ boolean reads_samplemask; /**< does fragment shader read sample mask? */
boolean writes_z; /**< does fragment shader write Z value? */
boolean writes_stencil; /**< does fragment shader write stencil value? */
boolean writes_samplemask; /**< does fragment shader write sample mask? */
diff --git a/src/gallium/auxiliary/util/u_box.h b/src/gallium/auxiliary/util/u_box.h
index 66cf989a830..00f231dc683 100644
--- a/src/gallium/auxiliary/util/u_box.h
+++ b/src/gallium/auxiliary/util/u_box.h
@@ -195,4 +195,16 @@ u_box_minify_2d(struct pipe_box *dst,
dst->height = MAX2(src->height >> l, 1);
}
+static inline void
+u_box_minify_3d(struct pipe_box *dst,
+ const struct pipe_box *src, unsigned l)
+{
+ dst->x = src->x >> l;
+ dst->y = src->y >> l;
+ dst->z = src->z >> l;
+ dst->width = MAX2(src->width >> l, 1);
+ dst->height = MAX2(src->height >> l, 1);
+ dst->depth = MAX2(src->depth >> l, 1);
+}
+
#endif
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index c719d3a77f0..a84de4fef7b 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -52,7 +52,7 @@
#include
#endif
-#if defined(PIPE_OS_FREEBSD)
+#if defined(PIPE_OS_FREEBSD) || defined(PIPE_OS_DRAGONFLY)
#include
#include
#endif
diff --git a/src/gallium/auxiliary/util/u_format_parse.py b/src/gallium/auxiliary/util/u_format_parse.py
index 929017a4486..d83603faa78 100755
--- a/src/gallium/auxiliary/util/u_format_parse.py
+++ b/src/gallium/auxiliary/util/u_format_parse.py
@@ -313,7 +313,7 @@ def _parse_channels(fields, layout, colorspace, swizzles):
return channels
def parse(filename):
- '''Parse the format descrition in CSV format in terms of the
+ '''Parse the format description in CSV format in terms of the
Channel and Format classes above.'''
stream = open(filename)
diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h
index d28fae3c77d..966d213bdd5 100644
--- a/src/gallium/auxiliary/util/u_half.h
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -74,7 +74,11 @@ util_float_to_half(float f)
f32.ui &= round_mask;
f32.f *= magic.f;
f32.ui -= round_mask;
-
+ /*
+ * XXX: The magic mul relies on denorms being available, otherwise
+ * all f16 denorms get flushed to zero - hence when this is used
+ * for tgsi_exec in softpipe we won't get f16 denorms.
+ */
/*
* Clamp to max finite value if overflowed.
* OpenGL has completely undefined rounding behavior for float to
@@ -112,6 +116,7 @@ util_half_to_float(uint16_t f16)
/* Adjust */
f32.f *= magic.f;
+ /* XXX: The magic mul relies on denorms being available */
/* Inf / NaN */
if (f32.f >= infnan.f)
diff --git a/src/gallium/auxiliary/vl/vl_zscan.c b/src/gallium/auxiliary/vl/vl_zscan.c
index 1c6cdd4f2c9..5241471f516 100644
--- a/src/gallium/auxiliary/vl/vl_zscan.c
+++ b/src/gallium/auxiliary/vl/vl_zscan.c
@@ -49,6 +49,13 @@ enum VS_OUTPUT
VS_O_VTEX = 0
};
+const int vl_zscan_normal_16[] =
+{
+ /* Zig-Zag scan pattern */
+ 0, 1, 4, 8, 5, 2, 3, 6,
+ 9,12,13,10, 7,11,14,15
+};
+
const int vl_zscan_linear[] =
{
/* Linear scan pattern */
diff --git a/src/gallium/auxiliary/vl/vl_zscan.h b/src/gallium/auxiliary/vl/vl_zscan.h
index eacee2db64f..268cf0a6e32 100644
--- a/src/gallium/auxiliary/vl/vl_zscan.h
+++ b/src/gallium/auxiliary/vl/vl_zscan.h
@@ -64,6 +64,7 @@ struct vl_zscan_buffer
struct pipe_surface *dst;
};
+extern const int vl_zscan_normal_16[];
extern const int vl_zscan_linear[];
extern const int vl_zscan_normal[];
extern const int vl_zscan_alternate[];
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 4c03e00008c..904e1ff04e7 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -325,6 +325,11 @@ returned). Otherwise, if the ``wait`` parameter is FALSE, the call
will not block and the return value will be TRUE if the query has
completed or FALSE otherwise.
+``get_query_result_resource`` is used to store the result of a query into
+a resource without synchronizing with the CPU. This write will optionally
+wait for the query to complete, and will optionally write whether the value
+is available instead of the value itself.
+
The interface currently includes the following types of queries:
``PIPE_QUERY_OCCLUSION_COUNTER`` counts the number of fragments which
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index b461810644a..3324bcca6f4 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -138,6 +138,10 @@ The integer capabilities:
* ``PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT``: Describes the required
alignment for pipe_sampler_view::u.buf.first_element, in bytes.
If a driver does not support first/last_element, it should return 0.
+* ``PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY``: Whether the driver only
+ supports R, RG, RGB and RGBA formats for PIPE_BUFFER sampler views.
+ When this is the case it should be assumed that the swizzle parameters
+ in the sampler view have no effect.
* ``PIPE_CAP_TGSI_TEXCOORD``: This CAP describes a hw limitation.
If true, the hardware cannot replace arbitrary shader inputs with sprite
coordinates and hence the inputs that are desired to be replaceable must
@@ -164,7 +168,7 @@ The integer capabilities:
view it is intended to be used with, or herein undefined results may occur
for permutational swizzles.
* ``PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE``: The maximum accessible size with
- a buffer sampler view, in bytes.
+ a buffer sampler view, in texels.
* ``PIPE_CAP_MAX_VIEWPORTS``: The maximum number of viewports (and scissors
since they are linked) a driver can support. Returning 0 is equivalent
to returning 1 because every driver has to support at least a single
@@ -306,6 +310,15 @@ The integer capabilities:
* ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap
is supported.
* ``PIPE_CAP_STRING_MARKER``: Whether pipe->emit_string_marker() is supported.
+* ``PIPE_CAP_SURFACE_REINTERPRET_BLOCKS``: Indicates whether
+ pipe_context::create_surface supports reinterpreting a texture as a surface
+ of a format with different block width/height (but same block size in bits).
+ For example, a compressed texture image can be interpreted as a
+ non-compressed surface whose texels are the same number of bits as the
+ compressed blocks, and vice versa. The width and height of the surface are
+ adjusted appropriately.
+* ``PIPE_CAP_QUERY_BUFFER_OBJECT``: Driver supports
+ context::get_query_result_resource callback.
.. _pipe_capf:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 7810a3eb915..489cbb0bc2f 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2372,6 +2372,23 @@ programs.
the program. Results are unspecified if any of the remaining
threads terminates or never reaches an executed BARRIER instruction.
+.. opcode:: MEMBAR - Memory barrier
+
+ ``MEMBAR type``
+
+ This opcode waits for the completion of all memory accesses based on
+ the type passed in. The type is an immediate bitfield with the following
+ meaning:
+
+ Bit 0: Shader storage buffers
+ Bit 1: Atomic buffers
+ Bit 2: Images
+ Bit 3: Shared memory
+ Bit 4: Thread group
+
+ These may be passed in, in any combination. An implementation is free to not
+ distinguish between these as it sees fit. However, these map to all the
+ possibilities made available by GLSL.
.. _atomopcodes:
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index c5ea86f9368..c54bb1091f7 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -152,6 +152,9 @@ fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len)
struct fd_ringbuffer *ring = ctx->ring;
const uint32_t *buf = (const void *)string;
+ /* max packet size is 0x3fff dwords: */
+ len = MIN2(len, 0x3fff * 4);
+
OUT_PKT3(ring, CP_NOP, align(len, 4) / 4);
while (len >= 4) {
OUT_RING(ring, *buf);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 640f50f5dcb..27f4d267438 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -165,6 +165,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_BARRIER:
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
case PIPE_CAP_COMPUTE:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_SM3:
@@ -183,6 +184,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_CLIP_HALFZ:
return is_a3xx(screen) || is_a4xx(screen);
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ return 0;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
if (is_a3xx(screen)) return 16;
if (is_a4xx(screen)) return 32;
@@ -248,6 +251,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
return 0;
case PIPE_CAP_MAX_VIEWPORTS:
@@ -296,6 +300,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
/* Queries. */
case PIPE_CAP_QUERY_TIME_ELAPSED:
case PIPE_CAP_QUERY_TIMESTAMP:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 0;
case PIPE_CAP_OCCLUSION_QUERY:
return is_a3xx(screen) || is_a4xx(screen);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 1ea2dd9cbf7..6eb6a2d52ef 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -556,6 +556,10 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp)
}
}
+/* NOTE: this creates the "TGSI" style fragface (ie. input slot
+ * VARYING_SLOT_FACE). For NIR style nir_intrinsic_load_front_face
+ * we can just use the value from hw directly (since it is boolean)
+ */
static struct ir3_instruction *
create_frag_face(struct ir3_compile *ctx, unsigned comp)
{
@@ -1224,7 +1228,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
break;
case nir_intrinsic_load_vertex_id_zero_base:
if (!ctx->vertex_id) {
- ctx->vertex_id = create_input(ctx->block, 0);
+ ctx->vertex_id = create_input(b, 0);
add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
ctx->vertex_id);
}
@@ -1232,7 +1236,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
break;
case nir_intrinsic_load_instance_id:
if (!ctx->instance_id) {
- ctx->instance_id = create_input(ctx->block, 0);
+ ctx->instance_id = create_input(b, 0);
add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
ctx->instance_id);
}
@@ -1244,6 +1248,14 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
}
break;
+ case nir_intrinsic_load_front_face:
+ if (!ctx->frag_face) {
+ ctx->so->frag_face = true;
+ ctx->frag_face = create_input(b, 0);
+ ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
+ }
+ dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
+ break;
case nir_intrinsic_discard_if:
case nir_intrinsic_discard: {
struct ir3_instruction *cond, *kill;
@@ -1349,6 +1361,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
struct ir3_block *b = ctx->block;
struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
struct ir3_instruction **coord, *lod, *compare, *proj, **off, **ddx, **ddy;
+ struct ir3_instruction *const_off[4];
bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
unsigned i, coords, flags;
unsigned nsrc0 = 0, nsrc1 = 0;
@@ -1392,7 +1405,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
ddy = get_src(ctx, &tex->src[i].src);
break;
default:
- compile_error(ctx, "Unhandled NIR tex serc type: %d\n",
+ compile_error(ctx, "Unhandled NIR tex src type: %d\n",
tex->src[i].src_type);
return;
}
@@ -1417,6 +1430,21 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
tex_info(tex, &flags, &coords);
+ if (!has_off) {
+ /* could still have a constant offset: */
+ if (tex->const_offset[0] || tex->const_offset[1] ||
+ tex->const_offset[2] || tex->const_offset[3]) {
+ off = const_off;
+
+ off[0] = create_immed(b, tex->const_offset[0]);
+ off[1] = create_immed(b, tex->const_offset[1]);
+ off[2] = create_immed(b, tex->const_offset[2]);
+ off[3] = create_immed(b, tex->const_offset[3]);
+
+ has_off = true;
+ }
+ }
+
/* scale up integer coords for TXF based on the LOD */
if (ctx->unminify_coords && (opc == OPC_ISAML)) {
assert(has_lod);
@@ -2053,6 +2081,9 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CLIP_DIST1:
break;
+ case VARYING_SLOT_CLIP_VERTEX:
+ /* handled entirely in nir_lower_clip: */
+ return;
default:
if (slot >= VARYING_SLOT_VAR0)
break;
@@ -2135,11 +2166,17 @@ emit_instructions(struct ir3_compile *ctx)
setup_output(ctx, var);
}
- /* Setup variables (which should only be arrays): */
+ /* Setup global variables (which should only be arrays): */
nir_foreach_variable(var, &ctx->s->globals) {
declare_var(ctx, var);
}
+ /* Setup local variables (which should only be arrays): */
+ /* NOTE: need to do something more clever when we support >1 fxn */
+ nir_foreach_variable(var, &fxn->locals) {
+ declare_var(ctx, var);
+ }
+
/* And emit the body: */
ctx->impl = fxn;
emit_function(ctx, fxn);
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 6b0ab587001..8d010f9dc8c 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -262,6 +262,9 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 5171cca9ea6..44d7c11af43 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -428,6 +428,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_CUBE_MAP_ARRAY:
case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
return true;
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ return 0;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
return 1;
case PIPE_CAP_TGSI_TEXCOORD:
@@ -486,6 +488,9 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index db45cbbb057..34008e1c01e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -308,17 +308,4 @@ void
lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
-#ifdef PIPE_ARCH_SSE
-#include
-#include "util/u_sse.h"
-
-static inline __m128i
-lp_plane_to_m128i(const struct lp_rast_plane *plane)
-{
- return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
- (int32_t)plane->dcdy, (int32_t)plane->eo);
-}
-
-#endif
-
#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 0ae6ec28d35..f4a2f0268f0 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
+ const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
const struct lp_rast_plane *plane = GET_PLANES(tri);
@@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
unsigned nr = 0;
- __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
- __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
- __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+ /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
+ __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
+ __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
+ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
__m128i zero = _mm_setzero_si128();
- __m128i c;
- __m128i dcdx;
- __m128i dcdy;
- __m128i rej4;
-
- __m128i dcdx2;
- __m128i dcdx3;
+ __m128i c, dcdx, dcdy, rej4;
+ __m128i dcdx_neg_mask, dcdy_neg_mask;
+ __m128i dcdx2, dcdx3;
__m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
__m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
__m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
__m128i unused;
-
+
transpose4_epi32(&p0, &p1, &p2, &zero,
- &c, &dcdx, &dcdy, &rej4);
+ &c, &unused, &dcdx, &dcdy);
+
+ /* recalc eo - easier than trying to load as scalars / shuffle... */
+ dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
+ dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
+ rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
+ _mm_and_si128(dcdx_neg_mask, dcdx));
/* Adjust dcdx;
*/
@@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
+ const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
const struct lp_rast_plane *plane = GET_PLANES(tri);
unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
- __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
- __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
- __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+ /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
+ __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
+ __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
+ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
__m128i zero = _mm_setzero_si128();
- __m128i c;
- __m128i dcdx;
- __m128i dcdy;
+ __m128i c, dcdx, dcdy;
+ __m128i dcdx2, dcdx3;
- __m128i dcdx2;
- __m128i dcdx3;
-
__m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
__m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
__m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
__m128i unused;
transpose4_epi32(&p0, &p1, &p2, &zero,
- &c, &dcdx, &dcdy, &unused);
+ &c, &unused, &dcdx, &dcdy);
/* Adjust dcdx;
*/
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 879a2e7d2f0..2c66bf46332 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -311,6 +311,10 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
}
/* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 03bb8ce2b6f..5ab297d7e1a 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -168,6 +168,21 @@ struct lp_setup_context
const float (*v2)[4]);
};
+static inline void
+scissor_planes_needed(boolean scis_planes[4], struct u_rect *bbox,
+ struct u_rect *scissor)
+{
+ /* left */
+ scis_planes[0] = (bbox->x0 < scissor->x0);
+ /* right */
+ scis_planes[1] = (bbox->x1 > scissor->x1);
+ /* top */
+ scis_planes[2] = (bbox->y0 < scissor->y0);
+ /* bottom */
+ scis_planes[3] = (bbox->y1 > scissor->y1);
+}
+
+
void lp_setup_choose_triangle( struct lp_setup_context *setup );
void lp_setup_choose_line( struct lp_setup_context *setup );
void lp_setup_choose_point( struct lp_setup_context *setup );
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index f425825fc2a..af4e7900d3c 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup,
layer = MIN2(layer, scene->fb_max_layer);
}
- if (setup->scissor_test) {
- nr_planes = 8;
- }
- else {
- nr_planes = 4;
- }
-
dx = v1[0][0] - v2[0][0];
dy = v1[0][1] - v2[0][1];
area = (dx * dx + dy * dy);
@@ -591,6 +584,18 @@ try_setup_line( struct lp_setup_context *setup,
bbox.x0 = MAX2(bbox.x0, 0);
bbox.y0 = MAX2(bbox.y0, 0);
+ nr_planes = 4;
+ /*
+ * Determine how many scissor planes we need, that is drop scissor
+ * edges if the bounding box of the tri is fully inside that edge.
+ */
+ if (setup->scissor_test) {
+ /* why not just use draw_regions */
+ boolean s_planes[4];
+ scissor_planes_needed(s_planes, &bbox, &setup->scissors[viewport_index]);
+ nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
+ }
+
line = lp_setup_alloc_triangle(scene,
key->num_inputs,
nr_planes,
@@ -708,30 +713,46 @@ try_setup_line( struct lp_setup_context *setup,
* Note that otherwise, the scissor planes only vary in 'C' value,
* and even then only on state-changes. Could alternatively store
* these planes elsewhere.
+ * (Or only store the c value together with a bit indicating which
+ * scissor edge this is, so rasterization would treat them differently
+ * (easier to evaluate) to ordinary planes.)
*/
- if (nr_planes == 8) {
- const struct u_rect *scissor =
- &setup->scissors[viewport_index];
+ if (nr_planes > 4) {
+ /* why not just use draw_regions */
+ struct u_rect *scissor = &setup->scissors[viewport_index];
+ struct lp_rast_plane *plane_s = &plane[4];
+ boolean s_planes[4];
+ scissor_planes_needed(s_planes, &bbox, scissor);
- plane[4].dcdx = -1 << 8;
- plane[4].dcdy = 0;
- plane[4].c = (1-scissor->x0) << 8;
- plane[4].eo = 1 << 8;
-
- plane[5].dcdx = 1 << 8;
- plane[5].dcdy = 0;
- plane[5].c = (scissor->x1+1) << 8;
- plane[5].eo = 0;
-
- plane[6].dcdx = 0;
- plane[6].dcdy = 1 << 8;
- plane[6].c = (1-scissor->y0) << 8;
- plane[6].eo = 1 << 8;
-
- plane[7].dcdx = 0;
- plane[7].dcdy = -1 << 8;
- plane[7].c = (scissor->y1+1) << 8;
- plane[7].eo = 0;
+ if (s_planes[0]) {
+ plane_s->dcdx = -1 << 8;
+ plane_s->dcdy = 0;
+ plane_s->c = (1-scissor->x0) << 8;
+ plane_s->eo = 1 << 8;
+ plane_s++;
+ }
+ if (s_planes[1]) {
+ plane_s->dcdx = 1 << 8;
+ plane_s->dcdy = 0;
+ plane_s->c = (scissor->x1+1) << 8;
+ plane_s->eo = 0 << 8;
+ plane_s++;
+ }
+ if (s_planes[2]) {
+ plane_s->dcdx = 0;
+ plane_s->dcdy = 1 << 8;
+ plane_s->c = (1-scissor->y0) << 8;
+ plane_s->eo = 1 << 8;
+ plane_s++;
+ }
+ if (s_planes[3]) {
+ plane_s->dcdx = 0;
+ plane_s->dcdy = -1 << 8;
+ plane_s->c = (scissor->y1+1) << 8;
+ plane_s->eo = 0;
+ plane_s++;
+ }
+ assert(plane_s == &plane[nr_planes]);
}
return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 907129dbd1b..cdb3d015dec 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
layer = MIN2(layer, scene->fb_max_layer);
}
- if (setup->scissor_test) {
- nr_planes = 7;
- }
- else {
- nr_planes = 3;
- }
-
/* Bounding rectangle (in pixels) */
{
/* Yes this is necessary to accurately calculate bounding boxes
@@ -347,6 +340,18 @@ do_triangle_ccw(struct lp_setup_context *setup,
bbox.x0 = MAX2(bbox.x0, 0);
bbox.y0 = MAX2(bbox.y0, 0);
+ nr_planes = 3;
+ /*
+ * Determine how many scissor planes we need, that is drop scissor
+ * edges if the bounding box of the tri is fully inside that edge.
+ */
+ if (setup->scissor_test) {
+ /* why not just use draw_regions */
+ boolean s_planes[4];
+ scissor_planes_needed(s_planes, &bbox, &setup->scissors[viewport_index]);
+ nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
+ }
+
tri = lp_setup_alloc_triangle(scene,
key->num_inputs,
nr_planes,
@@ -367,13 +372,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
/* Setup parameter interpolants:
*/
- setup->setup.variant->jit_function( v0,
- v1,
- v2,
- frontfacing,
- GET_A0(&tri->inputs),
- GET_DADX(&tri->inputs),
- GET_DADY(&tri->inputs) );
+ setup->setup.variant->jit_function(v0, v1, v2,
+ frontfacing,
+ GET_A0(&tri->inputs),
+ GET_DADX(&tri->inputs),
+ GET_DADY(&tri->inputs));
tri->inputs.frontfacing = frontfacing;
tri->inputs.disable = FALSE;
@@ -383,9 +386,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
if (0)
lp_dump_setup_coef(&setup->setup.variant->key,
- (const float (*)[4])GET_A0(&tri->inputs),
- (const float (*)[4])GET_DADX(&tri->inputs),
- (const float (*)[4])GET_DADY(&tri->inputs));
+ (const float (*)[4])GET_A0(&tri->inputs),
+ (const float (*)[4])GET_DADX(&tri->inputs),
+ (const float (*)[4])GET_DADY(&tri->inputs));
plane = GET_PLANES(tri);
@@ -672,29 +675,46 @@ do_triangle_ccw(struct lp_setup_context *setup,
* Note that otherwise, the scissor planes only vary in 'C' value,
* and even then only on state-changes. Could alternatively store
* these planes elsewhere.
+ * (Or only store the c value together with a bit indicating which
+ * scissor edge this is, so rasterization would treat them differently
+ * (easier to evaluate) to ordinary planes.)
*/
- if (nr_planes == 7) {
- const struct u_rect *scissor = &setup->scissors[viewport_index];
+ if (nr_planes > 3) {
+ /* why not just use draw_regions */
+ struct u_rect *scissor = &setup->scissors[viewport_index];
+ struct lp_rast_plane *plane_s = &plane[3];
+ boolean s_planes[4];
+ scissor_planes_needed(s_planes, &bbox, scissor);
- plane[3].dcdx = -1 << 8;
- plane[3].dcdy = 0;
- plane[3].c = (1-scissor->x0) << 8;
- plane[3].eo = 1 << 8;
-
- plane[4].dcdx = 1 << 8;
- plane[4].dcdy = 0;
- plane[4].c = (scissor->x1+1) << 8;
- plane[4].eo = 0;
-
- plane[5].dcdx = 0;
- plane[5].dcdy = 1 << 8;
- plane[5].c = (1-scissor->y0) << 8;
- plane[5].eo = 1 << 8;
-
- plane[6].dcdx = 0;
- plane[6].dcdy = -1 << 8;
- plane[6].c = (scissor->y1+1) << 8;
- plane[6].eo = 0;
+ if (s_planes[0]) {
+ plane_s->dcdx = -1 << 8;
+ plane_s->dcdy = 0;
+ plane_s->c = (1-scissor->x0) << 8;
+ plane_s->eo = 1 << 8;
+ plane_s++;
+ }
+ if (s_planes[1]) {
+ plane_s->dcdx = 1 << 8;
+ plane_s->dcdy = 0;
+ plane_s->c = (scissor->x1+1) << 8;
+ plane_s->eo = 0 << 8;
+ plane_s++;
+ }
+ if (s_planes[2]) {
+ plane_s->dcdx = 0;
+ plane_s->dcdy = 1 << 8;
+ plane_s->c = (1-scissor->y0) << 8;
+ plane_s->eo = 1 << 8;
+ plane_s++;
+ }
+ if (s_planes[3]) {
+ plane_s->dcdx = 0;
+ plane_s->dcdy = -1 << 8;
+ plane_s->c = (scissor->y1+1) << 8;
+ plane_s->eo = 0;
+ plane_s++;
+ }
+ assert(plane_s == &plane[nr_planes]);
}
return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
@@ -984,17 +1004,16 @@ calc_fixed_position(struct lp_setup_context *setup,
* Both should be acceptable, I think.
*/
#if defined(PIPE_ARCH_SSE)
- __m128d v0r, v1r, v2r;
+ __m128 v0r, v1r;
__m128 vxy0xy2, vxy1xy0;
__m128i vxy0xy2i, vxy1xy0i;
__m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
__m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
__m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
- v0r = _mm_load_sd((const double *)v0[0]);
- v1r = _mm_load_sd((const double *)v1[0]);
- v2r = _mm_load_sd((const double *)v2[0]);
- vxy0xy2 = _mm_castpd_ps(_mm_unpacklo_pd(v0r, v2r));
- vxy1xy0 = _mm_castpd_ps(_mm_unpacklo_pd(v1r, v0r));
+ v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0]));
+ vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]);
+ v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0]));
+ vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2);
vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index 6ad9dd31681..75e5fd843c2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -393,6 +393,9 @@ ImmediateValue::isInteger(const int i) const
case TYPE_S32:
case TYPE_U32:
return reg.data.s32 == i; // as if ...
+ case TYPE_S64:
+ case TYPE_U64:
+ return reg.data.s64 == i; // as if ...
case TYPE_F32:
return reg.data.f32 == static_cast(i);
case TYPE_F64:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index d1fdd75495f..9d7becf27d4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -132,6 +132,7 @@ enum operation
OP_SUBFM, // surface bitfield manipulation
OP_SUCLAMP, // clamp surface coordinates
OP_SUEAU, // surface effective address
+ OP_SUQ, // surface query
OP_MADSP, // special integer multiply-add
OP_TEXBAR, // texture dependency barrier
OP_DFDX,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 17cb484d2ba..0c7cd1d8137 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1947,10 +1947,16 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
case OP_CEIL:
case OP_FLOOR:
case OP_TRUNC:
- case OP_CVT:
case OP_SAT:
emitCVT(insn);
break;
+ case OP_CVT:
+ if (insn->def(0).getFile() == FILE_PREDICATE ||
+ insn->src(0).getFile() == FILE_PREDICATE)
+ emitMOV(insn);
+ else
+ emitCVT(insn);
+ break;
case OP_RSQ:
emitSFnOp(insn, 5 + 2 * insn->subOp);
break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 1fa0eb6da6d..dee26225b7e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -673,7 +673,12 @@ CodeEmitterGM107::emitMOV()
(insn->sType != TYPE_F32 && !longIMMD(insn->src(0)))) {
switch (insn->src(0).getFile()) {
case FILE_GPR:
- emitInsn(0x5c980000);
+ if (insn->def(0).getFile() == FILE_PREDICATE) {
+ emitInsn(0x5b6a0000);
+ emitGPR (0x08);
+ } else {
+ emitInsn(0x5c980000);
+ }
emitGPR (0x14, insn->src(0));
break;
case FILE_MEMORY_CONST:
@@ -684,18 +689,32 @@ CodeEmitterGM107::emitMOV()
emitInsn(0x38980000);
emitIMMD(0x14, 19, insn->src(0));
break;
+ case FILE_PREDICATE:
+ emitInsn(0x50880000);
+ emitPRED(0x0c, insn->src(0));
+ emitPRED(0x1d);
+ emitPRED(0x27);
+ break;
default:
assert(!"bad src file");
break;
}
- emitField(0x27, 4, insn->lanes);
+ if (insn->def(0).getFile() != FILE_PREDICATE &&
+ insn->src(0).getFile() != FILE_PREDICATE)
+ emitField(0x27, 4, insn->lanes);
} else {
emitInsn (0x01000000);
emitIMMD (0x14, 32, insn->src(0));
emitField(0x0c, 4, insn->lanes);
}
- emitGPR(0x00, insn->def(0));
+ if (insn->def(0).getFile() == FILE_PREDICATE) {
+ emitPRED(0x27);
+ emitPRED(0x03, insn->def(0));
+ emitPRED(0x00);
+ } else {
+ emitGPR(0x00, insn->def(0));
+ }
}
void
@@ -2684,11 +2703,7 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
emitRAM();
break;
case OP_MOV:
- if (insn->def(0).getFile() == FILE_GPR &&
- insn->src(0).getFile() != FILE_PREDICATE)
- emitMOV();
- else
- assert(!"R2P/P2R");
+ emitMOV();
break;
case OP_RDSV:
emitS2R();
@@ -2700,7 +2715,10 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
case OP_CEIL:
case OP_TRUNC:
case OP_CVT:
- if (isFloatType(insn->dType)) {
+ if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE ||
+ insn->src(0).getFile() == FILE_PREDICATE)) {
+ emitMOV();
+ } else if (isFloatType(insn->dType)) {
if (isFloatType(insn->sType))
emitF2F();
else
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 0b28047e22b..8637db91521 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2021,8 +2021,10 @@ CodeEmitterNVC0::emitATOM(const Instruction *i)
code[0] |= 63 << 20;
}
- if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
- srcId(i->src(2), 32 + 17);
+ if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+ assert(i->src(1).getSize() == 2 * typeSizeof(i->sType));
+ code[1] |= (SDATA(i->src(1)).id + 1) << 17;
+ }
}
void
@@ -2433,10 +2435,16 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
case OP_CEIL:
case OP_FLOOR:
case OP_TRUNC:
- case OP_CVT:
case OP_SAT:
emitCVT(insn);
break;
+ case OP_CVT:
+ if (insn->def(0).getFile() == FILE_PREDICATE ||
+ insn->src(0).getFile() == FILE_PREDICATE)
+ emitMOV(insn);
+ else
+ emitCVT(insn);
+ break;
case OP_RSQ:
emitSFnOp(insn, 5 + 2 * insn->subOp);
break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 9c4a38f291b..52ac198221d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -38,6 +38,7 @@ static nv50_ir::operation translateOpcode(uint opcode);
static nv50_ir::DataFile translateFile(uint file);
static nv50_ir::TexTarget translateTexture(uint texTarg);
static nv50_ir::SVSemantic translateSysVal(uint sysval);
+static nv50_ir::CacheMode translateCacheMode(uint qualifier);
class Instruction
{
@@ -213,6 +214,12 @@ public:
nv50_ir::TexInstruction::Target getTexture(const Source *, int s) const;
+ nv50_ir::CacheMode getCacheMode() const {
+ if (!insn->Instruction.Memory)
+ return nv50_ir::CACHE_CA;
+ return translateCacheMode(insn->Memory.Qualifier);
+ }
+
inline uint getLabel() { return insn->Label.Label; }
unsigned getSaturate() const { return insn->Instruction.Saturate; }
@@ -366,7 +373,7 @@ static nv50_ir::DataFile translateFile(uint file)
case TGSI_FILE_PREDICATE: return nv50_ir::FILE_PREDICATE;
case TGSI_FILE_IMMEDIATE: return nv50_ir::FILE_IMMEDIATE;
case TGSI_FILE_SYSTEM_VALUE: return nv50_ir::FILE_SYSTEM_VALUE;
- //case TGSI_FILE_RESOURCE: return nv50_ir::FILE_MEMORY_GLOBAL;
+ case TGSI_FILE_BUFFER: return nv50_ir::FILE_MEMORY_GLOBAL;
case TGSI_FILE_SAMPLER:
case TGSI_FILE_NULL:
default:
@@ -436,6 +443,15 @@ static nv50_ir::TexTarget translateTexture(uint tex)
}
}
+static nv50_ir::CacheMode translateCacheMode(uint qualifier)
+{
+ if (qualifier & TGSI_MEMORY_VOLATILE)
+ return nv50_ir::CACHE_CV;
+ if (qualifier & TGSI_MEMORY_COHERENT)
+ return nv50_ir::CACHE_CG;
+ return nv50_ir::CACHE_CA;
+}
+
nv50_ir::DataType Instruction::inferSrcType() const
{
switch (getOpcode()) {
@@ -1210,6 +1226,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
case TGSI_FILE_IMMEDIATE:
case TGSI_FILE_PREDICATE:
case TGSI_FILE_SAMPLER:
+ case TGSI_FILE_BUFFER:
break;
default:
ERROR("unhandled TGSI_FILE %d\n", decl->Declaration.File);
@@ -1255,6 +1272,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
if (insn.getDst(0).isIndirect(0))
indirectTempArrays.insert(insn.getDst(0).getArrayId());
+ } else
+ if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) {
+ info->io.globalAccess |= 0x2;
}
}
@@ -1264,13 +1284,10 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
if (src.isIndirect(0))
indirectTempArrays.insert(src.getArrayId());
} else
-/*
- if (src.getFile() == TGSI_FILE_RESOURCE) {
- if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
- info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
+ if (src.getFile() == TGSI_FILE_BUFFER) {
+ info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
0x1 : 0x2;
} else
-*/
if (src.getFile() == TGSI_FILE_OUTPUT) {
if (src.isIndirect(0)) {
// We don't know which one is accessed, just mark everything for
@@ -1752,7 +1769,7 @@ Converter::acquireDst(int d, int c)
int idx = dst.getIndex(0);
int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
- if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
+ if (dst.isMasked(c) || f == TGSI_FILE_BUFFER)
return NULL;
if (dst.isIndirect(0) ||
@@ -2222,6 +2239,28 @@ Converter::handleLOAD(Value *dst0[4])
int c;
std::vector off, src, ldv, def;
+ if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER) {
+ for (c = 0; c < 4; ++c) {
+ if (!dst0[c])
+ continue;
+
+ Value *off = fetchSrc(1, c);
+ Symbol *sym;
+ if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE) {
+ off = NULL;
+ sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, tgsi.getSrc(1).getValueU32(0, info) + 4 * c);
+ } else {
+ sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, 4 * c);
+ }
+
+ Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off);
+ ld->cache = tgsi.getCacheMode();
+ if (tgsi.getSrc(0).isIndirect(0))
+ ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
+ }
+ return;
+ }
+
getResourceCoords(off, r, 1);
if (isResourceRaw(code, r)) {
@@ -2298,6 +2337,30 @@ Converter::handleSTORE()
int c;
std::vector off, src, dummy;
+ if (tgsi.getDst(0).getFile() == TGSI_FILE_BUFFER) {
+ for (c = 0; c < 4; ++c) {
+ if (!(tgsi.getDst(0).getMask() & (1 << c)))
+ continue;
+
+ Symbol *sym;
+ Value *off;
+ if (tgsi.getSrc(0).getFile() == TGSI_FILE_IMMEDIATE) {
+ off = NULL;
+ sym = makeSym(TGSI_FILE_BUFFER, r, -1, c,
+ tgsi.getSrc(0).getValueU32(0, info) + 4 * c);
+ } else {
+ off = fetchSrc(0, 0);
+ sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, 4 * c);
+ }
+
+ Instruction *st = mkStore(OP_STORE, TYPE_U32, sym, off, fetchSrc(1, c));
+ st->cache = tgsi.getCacheMode();
+ if (tgsi.getDst(0).isIndirect(0))
+ st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0));
+ }
+ return;
+ }
+
getResourceCoords(off, r, 0);
src = off;
const int s = src.size();
@@ -2359,6 +2422,37 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
std::vector defv;
LValue *dst = getScratch();
+ if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER) {
+ for (int c = 0; c < 4; ++c) {
+ if (!dst0[c])
+ continue;
+
+ Instruction *insn;
+ Value *off = fetchSrc(1, c), *off2 = NULL;
+ Value *sym;
+ if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE)
+ sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, tgsi.getSrc(1).getValueU32(c, info));
+ else
+ sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, 0);
+ if (tgsi.getSrc(0).isIndirect(0))
+ off2 = fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0);
+ if (subOp == NV50_IR_SUBOP_ATOM_CAS)
+ insn = mkOp3(OP_ATOM, ty, dst, sym, fetchSrc(2, c), fetchSrc(3, c));
+ else
+ insn = mkOp2(OP_ATOM, ty, dst, sym, fetchSrc(2, c));
+ if (tgsi.getSrc(1).getFile() != TGSI_FILE_IMMEDIATE)
+ insn->setIndirect(0, 0, off);
+ if (off2)
+ insn->setIndirect(0, 1, off2);
+ insn->subOp = subOp;
+ }
+ for (int c = 0; c < 4; ++c)
+ if (dst0[c])
+ dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
+ return;
+ }
+
+
getResourceCoords(srcv, r, 1);
if (isResourceSpecial(r)) {
@@ -3103,6 +3197,14 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
geni->fixed = 1;
geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
break;
+ case TGSI_OPCODE_MEMBAR:
+ geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL);
+ geni->fixed = 1;
+ if (tgsi.getSrc(0).getValueU32(0, info) & TGSI_MEMBAR_THREAD_GROUP)
+ geni->subOp = NV50_IR_SUBOP_MEMBAR(M, CTA);
+ else
+ geni->subOp = NV50_IR_SUBOP_MEMBAR(M, GL);
+ break;
case TGSI_OPCODE_ATOMUADD:
case TGSI_OPCODE_ATOMXCHG:
case TGSI_OPCODE_ATOMCAS:
@@ -3115,6 +3217,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
case TGSI_OPCODE_ATOMIMAX:
handleATOM(dst0, dstTy, tgsi::opcodeToSubOp(tgsi.getOpcode()));
break;
+ case TGSI_OPCODE_RESQ:
+ geni = mkOp1(OP_SUQ, TYPE_U32, dst0[0],
+ makeSym(TGSI_FILE_BUFFER, tgsi.getSrc(0).getIndex(0), -1, 0, 0));
+ if (tgsi.getSrc(0).isIndirect(0))
+ geni->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
+ break;
case TGSI_OPCODE_IBFE:
case TGSI_OPCODE_UBFE:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index dc1ab769b98..e7cb54bc426 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1022,11 +1022,22 @@ NVC0LoweringPass::handleTXLQ(TexInstruction *i)
return true;
}
+bool
+NVC0LoweringPass::handleSUQ(Instruction *suq)
+{
+ suq->op = OP_MOV;
+ suq->setSrc(0, loadResLength32(suq->getIndirect(0, 1),
+ suq->getSrc(0)->reg.fileIndex * 16));
+ suq->setIndirect(0, 0, NULL);
+ suq->setIndirect(0, 1, NULL);
+ return true;
+}
bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
SVSemantic sv;
+ Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;
switch (atom->src(0).getFile()) {
case FILE_MEMORY_LOCAL:
@@ -1037,16 +1048,22 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
break;
default:
assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
+ base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
+ assert(base->reg.size == 8);
+ if (ptr)
+ base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
+ assert(base->reg.size == 8);
+ atom->setIndirect(0, 0, base);
return true;
}
- Value *base =
+ base =
bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
- Value *ptr = atom->getIndirect(0, 0);
atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
if (ptr)
base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
+ atom->setIndirect(0, 1, NULL);
atom->setIndirect(0, 0, base);
return true;
@@ -1069,7 +1086,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
cctl->setPredicate(cas->cc, cas->getPredicate());
}
- if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+ if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
// CAS is crazy. It's 2nd source is a double reg, and the 3rd source
// should be set to the high part of the double reg or bad things will
// happen elsewhere in the universe.
@@ -1079,6 +1096,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
bld.setPosition(cas, false);
bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
cas->setSrc(1, dreg);
+ cas->setSrc(2, dreg);
}
return true;
@@ -1093,6 +1111,32 @@ NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}
+inline Value *
+NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
+{
+ uint8_t b = prog->driver->io.resInfoCBSlot;
+ off += prog->driver->io.suInfoBase;
+
+ if (ptr)
+ ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
+
+ return bld.
+ mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
+}
+
+inline Value *
+NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
+{
+ uint8_t b = prog->driver->io.resInfoCBSlot;
+ off += prog->driver->io.suInfoBase;
+
+ if (ptr)
+ ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
+
+ return bld.
+ mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
+}
+
inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
@@ -1786,6 +1830,7 @@ NVC0LoweringPass::visit(Instruction *i)
return handleRDSV(i);
case OP_WRSV:
return handleWRSV(i);
+ case OP_STORE:
case OP_LOAD:
if (i->src(0).getFile() == FILE_SHADER_INPUT) {
if (prog->getType() == Program::TYPE_COMPUTE) {
@@ -1820,6 +1865,26 @@ NVC0LoweringPass::visit(Instruction *i)
} else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
i->op = OP_VFETCH;
+ } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+ Value *ind = i->getIndirect(0, 1);
+ Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
+ // XXX come up with a way not to do this for EVERY little access but
+ // rather to batch these up somehow. Unfortunately we've lost the
+ // information about the field width by the time we get here.
+ Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+ Value *length = loadResLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
+ Value *pred = new_LValue(func, FILE_PREDICATE);
+ if (i->src(0).isIndirect(0)) {
+ bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+ bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+ }
+ i->setIndirect(0, 1, NULL);
+ i->setIndirect(0, 0, ptr);
+ bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+ i->setPredicate(CC_NOT_P, pred);
+ if (i->defExists(0)) {
+ bld.mkMov(i->getDef(0), bld.mkImm(0));
+ }
}
break;
case OP_ATOM:
@@ -1838,6 +1903,9 @@ NVC0LoweringPass::visit(Instruction *i)
if (targ->getChipset() >= NVISA_GK104_CHIPSET)
handleSurfaceOpNVE4(i->asTex());
break;
+ case OP_SUQ:
+ handleSUQ(i);
+ break;
default:
break;
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index adb400a559a..09ec7e69ddc 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -101,6 +101,7 @@ protected:
bool handleTXQ(TexInstruction *);
virtual bool handleManualTXD(TexInstruction *);
bool handleTXLQ(TexInstruction *);
+ bool handleSUQ(Instruction *);
bool handleATOM(Instruction *);
bool handleCasExch(Instruction *, bool needCctl);
void handleSurfaceOpNVE4(TexInstruction *);
@@ -116,6 +117,8 @@ private:
void readTessCoord(LValue *dst, int c);
Value *loadResInfo32(Value *ptr, uint32_t off);
+ Value *loadResInfo64(Value *ptr, uint32_t off);
+ Value *loadResLength32(Value *ptr, uint32_t off);
Value *loadMsInfo32(Value *ptr, uint32_t off);
Value *loadTexHandle(Value *ptr, unsigned int slot);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 95e9fdfc57d..05b8db4a3d8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -336,6 +336,7 @@ private:
void expr(Instruction *, ImmediateValue&, ImmediateValue&);
void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
void opnd(Instruction *, ImmediateValue&, int s);
+ void opnd3(Instruction *, ImmediateValue&);
void unary(Instruction *, const ImmediateValue&);
@@ -388,6 +389,8 @@ ConstantFolding::visit(BasicBlock *bb)
else
if (i->srcExists(1) && i->src(1).getImmediate(src1))
opnd(i, src1, 1);
+ if (i->srcExists(2) && i->src(2).getImmediate(src2))
+ opnd3(i, src2);
}
return true;
}
@@ -872,6 +875,24 @@ ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
}
}
+void
+ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
+{
+ switch (i->op) {
+ case OP_MAD:
+ case OP_FMA:
+ if (imm2.isInteger(0)) {
+ i->op = OP_MUL;
+ i->setSrc(2, NULL);
+ foldCount++;
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+}
+
void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
@@ -1202,6 +1223,14 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
}
break;
+ case OP_SHR:
+ if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
+ bld.setPosition(i, false);
+ i->op = OP_AND;
+ i->setSrc(0, si->getSrc(0));
+ i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
+ }
+ break;
case OP_MUL:
int muls;
if (isFloatType(si->dType))
@@ -2504,6 +2533,12 @@ MemoryOpt::runOpt(BasicBlock *bb)
}
} else
if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
+ if (typeSizeof(ldst->dType) == 4 &&
+ ldst->src(1).getFile() == FILE_GPR &&
+ ldst->getSrc(1)->getInsn()->op == OP_NOP) {
+ delete_Instruction(prog, ldst);
+ continue;
+ }
isLoad = false;
} else {
// TODO: maybe have all fixed ops act as barrier ?
@@ -3015,7 +3050,7 @@ Instruction::isResultEqual(const Instruction *that) const
if (that->srcExists(s))
return false;
- if (op == OP_LOAD || op == OP_VFETCH) {
+ if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
switch (src(0).getFile()) {
case FILE_MEMORY_CONST:
case FILE_SHADER_INPUT:
@@ -3046,6 +3081,8 @@ GlobalCSE::visit(BasicBlock *bb)
ik = phi->getSrc(0)->getInsn();
if (!ik)
continue; // probably a function input
+ if (ik->defCount(0xff) > 1)
+ continue; // too painful to check if we can really push this forward
for (s = 1; phi->srcExists(s); ++s) {
if (phi->getSrc(s)->refCount() > 1)
break;
@@ -3179,10 +3216,10 @@ DeadCodeElim::buryAll(Program *prog)
bool
DeadCodeElim::visit(BasicBlock *bb)
{
- Instruction *next;
+ Instruction *prev;
- for (Instruction *i = bb->getFirst(); i; i = next) {
- next = i->next;
+ for (Instruction *i = bb->getExit(); i; i = prev) {
+ prev = i->prev;
if (i->isDead()) {
++deadCount;
delete_Instruction(prog, i);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index 0b02599dbdd..47285a25c33 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -161,6 +161,7 @@ const char *operationStr[OP_LAST + 1] =
"subfm",
"suclamp",
"sueau",
+ "suq",
"madsp",
"texbar",
"dfdx",
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index cd8c42ced5e..de39be872e4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1544,6 +1544,9 @@ GCRA::cleanup(const bool success)
delete[] nodes;
nodes = NULL;
+ hi.next = hi.prev = &hi;
+ lo[0].next = lo[0].prev = &lo[0];
+ lo[1].next = lo[1].prev = &lo[1];
}
Symbol *
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index 4390a726d1c..ae0a8bb61d1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -46,7 +46,7 @@ const uint8_t Target::operationSrcNr[] =
1, 1, 1, // TEX, TXB, TXL,
1, 1, 1, 1, 1, 1, 2, // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP
1, 1, 2, 2, 2, 2, 2, // SULDB, SULDP, SUSTB, SUSTP, SUREDB, SUREDP, SULEA
- 3, 3, 3, 3, // SUBFM, SUCLAMP, SUEAU, MADSP
+ 3, 3, 3, 1, 3, // SUBFM, SUCLAMP, SUEAU, SUQ, MADSP
0, // TEXBAR
1, 1, // DFDX, DFDY
1, 2, 1, 2, 0, 0, // RDSV, WRSV, PIXLD, QUADOP, QUADON, QUADPOP
@@ -109,8 +109,8 @@ const OpClass Target::operationClass[] =
// SULDB, SULDP, SUSTB, SUSTP; SUREDB, SUREDP, SULEA
OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_ATOMIC, OPCLASS_SURFACE,
OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_SURFACE,
- // SUBFM, SUCLAMP, SUEAU, MADSP
- OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_ARITH,
+ // SUBFM, SUCLAMP, SUEAU, SUQ, MADSP
+ OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_ARITH,
// TEXBAR
OPCLASS_OTHER,
// DFDX, DFDY, RDSV, WRSV; PIXLD, QUADOP, QUADON, QUADPOP
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
index a3d07deeb18..c6c287bb8bb 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
@@ -266,7 +266,9 @@ nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
int i;
for (i = 0; i < num_buffers; ++i) {
+#ifndef NDEBUG
assert(bsp_bo->size >= str_bsp->w0[0] + num_bytes[i]);
+#endif
memcpy(dec->bsp_ptr, data[i], num_bytes[i]);
dec->bsp_ptr += num_bytes[i];
str_bsp->w0[0] += num_bytes[i];
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 61d91fd4cce..b62889119c5 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -184,6 +184,10 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 888d62e1c52..a67ef28abf8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -369,7 +369,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
NOUVEAU_ERR("shader translation failed: %i\n", ret);
goto out;
}
- FREE(info->bin.syms);
prog->code = info->bin.code;
prog->code_size = info->bin.codeSize;
@@ -403,10 +402,13 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
break;
}
prog->gp.vert_count = info->prop.gp.maxVertices;
- } else
+ }
+
if (prog->type == PIPE_SHADER_COMPUTE) {
prog->cp.syms = info->bin.syms;
prog->cp.num_syms = info->bin.numSyms;
+ } else {
+ FREE(info->bin.syms);
}
if (prog->pipe.stream_output.num_outputs)
@@ -507,6 +509,9 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
FREE(p->interps);
FREE(p->so);
+ if (type == PIPE_SHADER_COMPUTE)
+ FREE(p->cp.syms);
+
memset(p, 0, sizeof(*p));
p->pipe = pipe;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 32da60e0a23..14d0085975b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -227,6 +227,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 86be1b4c4ed..ec5cf376227 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -594,6 +594,82 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
PUSH_DATA (push, nv50->rt_array_mode);
}
+static void
+nv50_clear_buffer_push(struct pipe_context *pipe,
+ struct pipe_resource *res,
+ unsigned offset, unsigned size,
+ const void *data, int data_size)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv04_resource *buf = nv04_resource(res);
+ unsigned count = (size + 3) / 4;
+ unsigned xcoord = offset & 0xff;
+ unsigned tmp, i;
+
+ if (data_size == 1) {
+ tmp = *(unsigned char *)data;
+ tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
+ data = &tmp;
+ data_size = 4;
+ } else if (data_size == 2) {
+ tmp = *(unsigned short *)data;
+ tmp = (tmp << 16) | tmp;
+ data = &tmp;
+ data_size = 4;
+ }
+
+ unsigned data_words = data_size / 4;
+
+ nouveau_bufctx_refn(nv50->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
+ nouveau_pushbuf_bufctx(push, nv50->bufctx);
+ nouveau_pushbuf_validate(push);
+
+ offset &= ~0xff;
+
+ BEGIN_NV04(push, NV50_2D(DST_FORMAT), 2);
+ PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_2D(DST_PITCH), 5);
+ PUSH_DATA (push, 262144);
+ PUSH_DATA (push, 65536);
+ PUSH_DATA (push, 1);
+ PUSH_DATAh(push, buf->address + offset);
+ PUSH_DATA (push, buf->address + offset);
+ BEGIN_NV04(push, NV50_2D(SIFC_BITMAP_ENABLE), 2);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+ BEGIN_NV04(push, NV50_2D(SIFC_WIDTH), 10);
+ PUSH_DATA (push, size);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, xcoord);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+
+ while (count) {
+ unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
+ unsigned nr = nr_data * data_words;
+
+ BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr);
+ for (i = 0; i < nr_data; i++)
+ PUSH_DATAp(push, data, data_words);
+
+ count -= nr;
+ }
+
+ if (buf->mm) {
+ nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
+ nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
+ }
+
+ nouveau_bufctx_reset(nv50->bufctx, 0);
+}
+
static void
nv50_clear_buffer(struct pipe_context *pipe,
struct pipe_resource *res,
@@ -643,9 +719,22 @@ nv50_clear_buffer(struct pipe_context *pipe,
assert(size % data_size == 0);
+ if (offset & 0xff) {
+ unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
+ assert(fixup_size % data_size == 0);
+ nv50_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
+ offset += fixup_size;
+ size -= fixup_size;
+ if (!size)
+ return;
+ }
+
elements = size / data_size;
height = (elements + 8191) / 8192;
width = elements / height;
+ if (height > 1)
+ width &= ~0xff;
+ assert(width > 0);
BEGIN_NV04(push, NV50_3D(CLEAR_COLOR(0)), 4);
PUSH_DATAf(push, color.f[0]);
@@ -669,13 +758,13 @@ nv50_clear_buffer(struct pipe_context *pipe,
BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 5);
- PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
- PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
+ PUSH_DATAh(push, buf->address + offset);
+ PUSH_DATA (push, buf->address + offset);
PUSH_DATA (push, nv50_format_table[dst_fmt].rt);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
- PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
+ PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | align(width * data_size, 0x100));
PUSH_DATA (push, height);
BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
PUSH_DATA (push, 0);
@@ -694,25 +783,20 @@ nv50_clear_buffer(struct pipe_context *pipe,
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
PUSH_DATA (push, 0x3c);
- if (width * height != elements) {
- offset += width * height * data_size;
- width = elements - width * height;
- height = 1;
- BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 2);
- PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
- PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
- BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
- PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
- PUSH_DATA (push, height);
- BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
- PUSH_DATA (push, 0x3c);
- }
-
BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
PUSH_DATA (push, nv50->cond_condmode);
- nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
- nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
+ if (buf->mm) {
+ nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
+ nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
+ }
+
+ if (width * height != elements) {
+ offset += width * height * data_size;
+ width = elements - width * height;
+ nv50_clear_buffer_push(pipe, res, offset, width * data_size,
+ data, data_size);
+ }
nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index 4daa57d47bb..7f76ec66edb 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -491,3 +491,52 @@ daic_runout:
daic_runout_check:
branz annul $r7 #daic_runout
bra annul #daic_restore
+
+/* NVC0_3D_MACRO_QUERY_BUFFER_WRITE:
+ *
+ * This is a combination macro for all of our query buffer object needs.
+ * It has the option to clamp results to a configurable amount, as well as
+ * to write out one or two words.
+ *
+ * We use the query engine to write out the values, and expect the query
+ * address to point to the right place.
+ *
+ * arg = clamp value (0 means unclamped); when clamped, only one value is written.
+ * parm[0] = LSB of end value
+ * parm[1] = MSB of end value
+ * parm[2] = LSB of start value
+ * parm[3] = MSB of start value
+ * parm[4] = desired sequence
+ * parm[5] = actual sequence
+ */
+.section #mme9097_query_buffer_write
+ parm $r2
+ parm $r3
+ parm $r4
+ parm $r5 maddr 0x16c2 /* QUERY_SEQUENCE */
+ parm $r6
+ parm $r7
+ mov $r6 (sub $r7 $r6) /* actual - desired */
+ mov $r6 (sbb 0x0 0x0) /* if there was underflow, not reached yet */
+ braz annul $r6 #qbw_ready
+ exit
+qbw_ready:
+ mov $r2 (sub $r2 $r4)
+ braz $r1 #qbw_postclamp
+ mov $r3 (sbb $r3 $r5)
+ branz annul $r3 #qbw_clamp
+ mov $r4 (sub $r1 $r2)
+ mov $r4 (sbb 0x0 0x0)
+ braz annul $r4 #qbw_postclamp
+qbw_clamp:
+ mov $r2 $r1
+qbw_postclamp:
+ send $r2
+ mov $r4 0x1000
+ branz annul $r1 #qbw_done
+ send (extrinsrt 0x0 $r4 0x0 0x10 0x10)
+ maddr 0x16c2 /* QUERY_SEQUENCE */
+ send $r3
+qbw_done:
+ exit send (extrinsrt 0x0 $r4 0x0 0x10 0x10)
+ nop
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index bf8625e0584..ecadf7e4d29 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -332,3 +332,36 @@ uint32_t mme9097_draw_arrays_indirect_count[] = {
0xfffef837,
0xfffdc027,
};
+
+uint32_t mme9097_query_buffer_write[] = {
+ 0x00000201,
+ 0x00000301,
+/* 0x000a: qbw_ready */
+ 0x00000401,
+ 0x05b08551,
+/* 0x0011: qbw_clamp */
+/* 0x0012: qbw_postclamp */
+ 0x00000601,
+ 0x00000701,
+/* 0x0018: qbw_done */
+ 0x0005be10,
+ 0x00060610,
+ 0x0000b027,
+ 0x00000091,
+ 0x00051210,
+ 0x0001c807,
+ 0x00075b10,
+ 0x00011837,
+ 0x00048c10,
+ 0x00060410,
+ 0x0000a027,
+ 0x00000a11,
+ 0x00001041,
+ 0x04000411,
+ 0x00010837,
+ 0x84010042,
+ 0x05b08021,
+ 0x00001841,
+ 0x840100c2,
+ 0x00000011,
+};
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 162661ff2a7..547b8f5d309 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -56,6 +56,7 @@ static void
nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
int i, s;
if (flags & PIPE_BARRIER_MAPPED_BUFFER) {
@@ -90,6 +91,9 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
}
}
}
+ if (flags & PIPE_BARRIER_SHADER_BUFFER) {
+ IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
+ }
}
static void
@@ -122,6 +126,10 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
pipe_surface_reference(&nvc0->surfaces[s][i], NULL);
}
+ for (s = 0; s < 6; ++s)
+ for (i = 0; i < NVC0_MAX_BUFFERS; ++i)
+ pipe_resource_reference(&nvc0->buffers[s][i].buffer, NULL);
+
for (i = 0; i < nvc0->num_tfbbufs; ++i)
pipe_so_target_reference(&nvc0->tfbbuf[i], NULL);
@@ -180,10 +188,9 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
int ref)
{
struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
- unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
unsigned s, i;
- if (bind & PIPE_BIND_RENDER_TARGET) {
+ if (res->bind & PIPE_BIND_RENDER_TARGET) {
for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
if (nvc0->framebuffer.cbufs[i] &&
nvc0->framebuffer.cbufs[i]->texture == res) {
@@ -194,7 +201,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
}
- if (bind & PIPE_BIND_DEPTH_STENCIL) {
+ if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
if (nvc0->framebuffer.zsbuf &&
nvc0->framebuffer.zsbuf->texture == res) {
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -204,12 +211,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
- if (bind & (PIPE_BIND_VERTEX_BUFFER |
- PIPE_BIND_INDEX_BUFFER |
- PIPE_BIND_CONSTANT_BUFFER |
- PIPE_BIND_STREAM_OUTPUT |
- PIPE_BIND_COMMAND_ARGS_BUFFER |
- PIPE_BIND_SAMPLER_VIEW)) {
+ if (res->target == PIPE_BUFFER) {
for (i = 0; i < nvc0->num_vtxbufs; ++i) {
if (nvc0->vtxbuf[i].buffer == res) {
nvc0->dirty |= NVC0_NEW_ARRAYS;
@@ -253,6 +255,18 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
}
+
+ for (s = 0; s < 5; ++s) {
+ for (i = 0; i < NVC0_MAX_BUFFERS; ++i) {
+ if (nvc0->buffers[s][i].buffer == res) {
+ nvc0->buffers_dirty[s] |= 1 << i;
+ nvc0->dirty |= NVC0_NEW_BUFFERS;
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_BUF);
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
}
return ref;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 12195489691..4ab2ac41183 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -56,6 +56,7 @@
#define NVC0_NEW_SURFACES (1 << 23)
#define NVC0_NEW_MIN_SAMPLES (1 << 24)
#define NVC0_NEW_TESSFACTOR (1 << 25)
+#define NVC0_NEW_BUFFERS (1 << 26)
#define NVC0_NEW_CP_PROGRAM (1 << 0)
#define NVC0_NEW_CP_SURFACES (1 << 1)
@@ -73,9 +74,10 @@
#define NVC0_BIND_CB(s, i) (164 + 16 * (s) + (i))
#define NVC0_BIND_TFB 244
#define NVC0_BIND_SUF 245
-#define NVC0_BIND_SCREEN 246
-#define NVC0_BIND_TLS 247
-#define NVC0_BIND_3D_COUNT 248
+#define NVC0_BIND_BUF 246
+#define NVC0_BIND_SCREEN 247
+#define NVC0_BIND_TLS 249
+#define NVC0_BIND_3D_COUNT 250
/* compute bufctx (during launch_grid) */
#define NVC0_BIND_CP_CB(i) ( 0 + (i))
@@ -187,10 +189,15 @@ struct nvc0_context {
struct nvc0_blitctx *blit;
+ /* NOTE: some of these surfaces may reference buffers */
struct pipe_surface *surfaces[2][NVC0_MAX_SURFACE_SLOTS];
uint16_t surfaces_dirty[2];
uint16_t surfaces_valid[2];
+ struct pipe_shader_buffer buffers[6][NVC0_MAX_BUFFERS];
+ uint32_t buffers_dirty[6];
+ uint32_t buffers_valid[6];
+
struct util_dynarray global_residents;
};
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
index 27c026b8b30..49e176cbd49 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
@@ -33,4 +33,6 @@
#define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT 0x00003850
+#define NVC0_3D_MACRO_QUERY_BUFFER_WRITE 0x00003858
+
#endif /* __NVC0_MACROS_H__ */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index c3b53621630..93f211bd5fc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -554,6 +554,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
}
info->io.resInfoCBSlot = 15;
info->io.sampleInfoBase = 256 + 128;
+ info->io.suInfoBase = 512;
info->io.msInfoCBSlot = 15;
info->io.msInfoBase = 0; /* TODO */
}
@@ -635,6 +636,8 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
}
*/
if (info->io.globalAccess)
+ prog->hdr[0] |= 1 << 26;
+ if (info->io.globalAccess & 0x2)
prog->hdr[0] |= 1 << 16;
if (info->io.fp64)
prog->hdr[0] |= 1 << 27;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 7497317c419..d2acce7d5be 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -73,6 +73,24 @@ nvc0_get_query_result(struct pipe_context *pipe, struct pipe_query *pq,
return q->funcs->get_query_result(nvc0_context(pipe), q, wait, result);
}
+static void
+nvc0_get_query_result_resource(struct pipe_context *pipe,
+ struct pipe_query *pq,
+ boolean wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct nvc0_query *q = nvc0_query(pq);
+ if (!q->funcs->get_query_result_resource) {
+ assert(!"Unexpected lack of get_query_result_resource");
+ return;
+ }
+ q->funcs->get_query_result_resource(nvc0_context(pipe), q, wait, result_type,
+ index, resource, offset);
+}
+
static void
nvc0_render_condition(struct pipe_context *pipe,
struct pipe_query *pq,
@@ -129,7 +147,7 @@ nvc0_render_condition(struct pipe_context *pipe,
}
if (wait)
- nvc0_hw_query_fifo_wait(push, q);
+ nvc0_hw_query_fifo_wait(nvc0, q);
PUSH_SPACE(push, 7);
PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
@@ -262,6 +280,7 @@ nvc0_init_query_functions(struct nvc0_context *nvc0)
pipe->begin_query = nvc0_begin_query;
pipe->end_query = nvc0_end_query;
pipe->get_query_result = nvc0_get_query_result;
+ pipe->get_query_result_resource = nvc0_get_query_result_resource;
pipe->render_condition = nvc0_render_condition;
nvc0->cond_condmode = NVC0_3D_COND_MODE_ALWAYS;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
index c46361c31aa..a887b220557 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
@@ -14,6 +14,13 @@ struct nvc0_query_funcs {
void (*end_query)(struct nvc0_context *, struct nvc0_query *);
boolean (*get_query_result)(struct nvc0_context *, struct nvc0_query *,
boolean, union pipe_query_result *);
+ void (*get_query_result_resource)(struct nvc0_context *nvc0,
+ struct nvc0_query *q,
+ boolean wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset);
};
struct nvc0_query {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index 1bed0162baf..62385884137 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -358,11 +358,119 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
return true;
}
+static void
+nvc0_hw_get_query_result_resource(struct nvc0_context *nvc0,
+ struct nvc0_query *q,
+ boolean wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_hw_query *hq = nvc0_hw_query(q);
+ struct nv04_resource *buf = nv04_resource(resource);
+ unsigned stride;
+
+ assert(!hq->funcs || !hq->funcs->get_query_result);
+
+ if (index == -1) {
+ /* TODO: Use a macro to write the availability of the query */
+ if (hq->state != NVC0_HW_QUERY_STATE_READY)
+ nvc0_hw_query_update(nvc0->screen->base.client, q);
+ uint32_t ready[2] = {hq->state == NVC0_HW_QUERY_STATE_READY};
+ nvc0->base.push_cb(&nvc0->base, buf, offset,
+ result_type >= PIPE_QUERY_TYPE_I64 ? 2 : 1,
+ ready);
+ return;
+ }
+
+ /* If the fence guarding this query has not been emitted, that makes a lot
+ * of the following logic more complicated.
+ */
+ if (hq->is64bit && hq->fence->state < NOUVEAU_FENCE_STATE_EMITTED)
+ nouveau_fence_emit(hq->fence);
+
+ /* We either need to compute a 32- or 64-bit difference between 2 values,
+ * and then store the result as either a 32- or 64-bit value. As such let's
+ * treat all inputs as 64-bit (and just push an extra 0 for the 32-bit
+ * ones), and have one macro that clamps result to i32, u32, or just
+ * outputs the difference (no need to worry about 64-bit clamping).
+ */
+ if (hq->state != NVC0_HW_QUERY_STATE_READY)
+ nvc0_hw_query_update(nvc0->screen->base.client, q);
+
+ if (wait && hq->state != NVC0_HW_QUERY_STATE_READY)
+ nvc0_hw_query_fifo_wait(nvc0, q);
+
+ nouveau_pushbuf_space(push, 16, 2, 0);
+ PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ PUSH_REFN (push, buf->bo, buf->domain | NOUVEAU_BO_WR);
+ BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, buf->address + offset);
+ PUSH_DATA (push, buf->address + offset);
+ BEGIN_1IC0(push, NVC0_3D(MACRO_QUERY_BUFFER_WRITE), 7);
+ if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE) /* XXX what if 64-bit? */
+ PUSH_DATA(push, 0x00000001);
+ else if (result_type == PIPE_QUERY_TYPE_I32)
+ PUSH_DATA(push, 0x7fffffff);
+ else if (result_type == PIPE_QUERY_TYPE_U32)
+ PUSH_DATA(push, 0xffffffff);
+ else
+ PUSH_DATA(push, 0x00000000);
+
+ switch (q->type) {
+ case PIPE_QUERY_SO_STATISTICS:
+ stride = 2;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ stride = 12;
+ break;
+ default:
+ assert(index == 0);
+ stride = 1;
+ break;
+ }
+
+ if (hq->is64bit) {
+ nouveau_pushbuf_data(push, hq->bo, hq->offset + 16 * index,
+ 8 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+ nouveau_pushbuf_data(push, hq->bo, hq->offset + 16 * (index + stride),
+ 8 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+ } else {
+ nouveau_pushbuf_data(push, hq->bo, hq->offset + 4,
+ 4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+ PUSH_DATA(push, 0);
+ nouveau_pushbuf_data(push, hq->bo, hq->offset + 16 + 4,
+ 4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+ PUSH_DATA(push, 0);
+ }
+
+ if (wait || hq->state == NVC0_HW_QUERY_STATE_READY) {
+ PUSH_DATA(push, 0);
+ PUSH_DATA(push, 0);
+ } else if (hq->is64bit) {
+ PUSH_DATA(push, hq->fence->sequence);
+ nouveau_pushbuf_data(push, nvc0->screen->fence.bo, 0,
+ 4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+ } else {
+ PUSH_DATA(push, hq->sequence);
+ nouveau_pushbuf_data(push, hq->bo, hq->offset,
+ 4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
+ }
+
+ if (buf->mm) {
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
+ }
+}
+
static const struct nvc0_query_funcs hw_query_funcs = {
.destroy_query = nvc0_hw_destroy_query,
.begin_query = nvc0_hw_begin_query,
.end_query = nvc0_hw_end_query,
.get_query_result = nvc0_hw_get_query_result,
+ .get_query_result_resource = nvc0_hw_get_query_result_resource,
};
struct nvc0_query *
@@ -476,8 +584,9 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
}
void
-nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q)
+nvc0_hw_query_fifo_wait(struct nvc0_context *nvc0, struct nvc0_query *q)
{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nvc0_hw_query *hq = nvc0_hw_query(q);
unsigned offset = hq->offset;
@@ -486,9 +595,15 @@ nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q)
PUSH_SPACE(push, 5);
PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
- PUSH_DATAh(push, hq->bo->offset + offset);
- PUSH_DATA (push, hq->bo->offset + offset);
- PUSH_DATA (push, hq->sequence);
+ if (hq->is64bit) {
+ PUSH_DATAh(push, nvc0->screen->fence.bo->offset);
+ PUSH_DATA (push, nvc0->screen->fence.bo->offset);
+ PUSH_DATA (push, hq->fence->sequence);
+ } else {
+ PUSH_DATAh(push, hq->bo->offset + offset);
+ PUSH_DATA (push, hq->bo->offset + offset);
+ PUSH_DATA (push, hq->sequence);
+ }
PUSH_DATA (push, (1 << 12) |
NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
index 3701eb7100f..8225755d85e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
@@ -51,6 +51,6 @@ void
nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *,
unsigned);
void
-nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *);
+nvc0_hw_query_fifo_wait(struct nvc0_context *, struct nvc0_query *);
#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 84dbd69b8a5..d368fda707d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -111,6 +111,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 256;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
return 1; /* 256 for binding as RT, but that's not possible in GL */
+ case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+ return 16;
case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
return NOUVEAU_MIN_BUFFER_MAP_ALIGN;
case PIPE_CAP_MAX_VIEWPORTS:
@@ -189,6 +191,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -212,10 +215,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
- case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:
@@ -322,8 +327,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
- case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
return 0;
+ case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+ return NVC0_MAX_BUFFERS;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
return 16; /* would be 32 in linked (OpenGL-style) mode */
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
@@ -676,8 +682,9 @@ nvc0_screen_create(struct nouveau_device *dev)
push->rsvd_kick = 5;
screen->base.vidmem_bindings |= PIPE_BIND_CONSTANT_BUFFER |
+ PIPE_BIND_SHADER_BUFFER |
PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER |
- PIPE_BIND_COMMAND_ARGS_BUFFER;
+ PIPE_BIND_COMMAND_ARGS_BUFFER | PIPE_BIND_QUERY_BUFFER;
screen->base.sysmem_bindings |=
PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
@@ -891,9 +898,9 @@ nvc0_screen_create(struct nouveau_device *dev)
/* TIC and TSC entries for each unit (nve4+ only) */
/* auxiliary constants (6 user clip planes, base instance id) */
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
- PUSH_DATA (push, 512);
- PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (i << 9));
- PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (i << 9));
+ PUSH_DATA (push, 1024);
+ PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (i << 10));
+ PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (i << 10));
BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
PUSH_DATA (push, (15 << 4) | 1);
if (screen->eng3d->oclass >= NVE4_3D_CLASS) {
@@ -913,8 +920,8 @@ nvc0_screen_create(struct nouveau_device *dev)
/* return { 0.0, 0.0, 0.0, 0.0 } for out-of-bounds vtxbuf access */
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 256);
- PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
- PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
+ PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
+ PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
BEGIN_1IC0(push, NVC0_3D(CB_POS), 5);
PUSH_DATA (push, 0);
PUSH_DATAf(push, 0.0f);
@@ -922,8 +929,8 @@ nvc0_screen_create(struct nouveau_device *dev)
PUSH_DATAf(push, 0.0f);
PUSH_DATAf(push, 0.0f);
BEGIN_NVC0(push, NVC0_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
- PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
+ PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
+ PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
if (screen->base.drm->version >= 0x01000101) {
ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
@@ -953,8 +960,12 @@ nvc0_screen_create(struct nouveau_device *dev)
PUSH_DATA (push, screen->tls->size);
BEGIN_NVC0(push, NVC0_3D(WARP_TEMP_ALLOC), 1);
PUSH_DATA (push, 0);
+ /* Reduce likelihood of collision with real buffers by placing the hole at
+ * the top of the 4G area. This will have to be dealt with for real
+ * eventually by blocking off that area from the VM.
+ */
BEGIN_NVC0(push, NVC0_3D(LOCAL_BASE), 1);
- PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0xff << 24);
if (screen->eng3d->oclass < GM107_3D_CLASS) {
ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL,
@@ -1039,6 +1050,7 @@ nvc0_screen_create(struct nouveau_device *dev)
MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
+ MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write);
BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
PUSH_DATA (push, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 8b73102b98b..1a56177815c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -22,6 +22,8 @@
#define NVC0_MAX_VIEWPORTS 16
+#define NVC0_MAX_BUFFERS 32
+
struct nvc0_context;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index dc02b011bdf..382a18ef153 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -316,7 +316,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
continue;
if (!targ->clean)
- nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
+ nvc0_hw_query_fifo_wait(nvc0, nvc0_query(targ->pq));
nouveau_pushbuf_space(push, 0, 0, 1);
BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
PUSH_DATA (push, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 24a6c222dd5..cf3d3497c78 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -1243,11 +1243,50 @@ nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader,
unsigned start_slot, unsigned count,
struct pipe_image_view **views)
{
-#if 0
- nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, views);
+}
- nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES;
-#endif
+static void
+nvc0_bind_buffers_range(struct nvc0_context *nvc0, const unsigned t,
+ unsigned start, unsigned nr,
+ struct pipe_shader_buffer *pbuffers)
+{
+ const unsigned end = start + nr;
+ const unsigned mask = ((1 << nr) - 1) << start;
+ unsigned i;
+
+ assert(t < 5);
+
+ if (pbuffers) {
+ for (i = start; i < end; ++i) {
+ const unsigned p = i - start;
+ if (pbuffers[p].buffer)
+ nvc0->buffers_valid[t] |= (1 << i);
+ else
+ nvc0->buffers_valid[t] &= ~(1 << i);
+ nvc0->buffers[t][i].buffer_offset = pbuffers[p].buffer_offset;
+ nvc0->buffers[t][i].buffer_size = pbuffers[p].buffer_size;
+ pipe_resource_reference(&nvc0->buffers[t][i].buffer, pbuffers[p].buffer);
+ }
+ } else {
+ for (i = start; i < end; ++i)
+ pipe_resource_reference(&nvc0->buffers[t][i].buffer, NULL);
+ nvc0->buffers_valid[t] &= ~mask;
+ }
+ nvc0->buffers_dirty[t] |= mask;
+
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_BUF);
+}
+
+static void
+nvc0_set_shader_buffers(struct pipe_context *pipe,
+ unsigned shader,
+ unsigned start, unsigned nr,
+ struct pipe_shader_buffer *buffers)
+{
+ const unsigned s = nvc0_shader_stage(shader);
+ nvc0_bind_buffers_range(nvc0_context(pipe), s, start, nr, buffers);
+
+ nvc0_context(pipe)->dirty |= NVC0_NEW_BUFFERS;
}
static inline void
@@ -1377,6 +1416,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
pipe->set_global_binding = nvc0_set_global_bindings;
pipe->set_compute_resources = nvc0_set_compute_resources;
pipe->set_shader_images = nvc0_set_shader_images;
+ pipe->set_shader_buffers = nvc0_set_shader_buffers;
nvc0->sample_mask = ~0;
nvc0->min_samples = 1;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index b02a590c375..c17223a1b2b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -183,9 +183,9 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
ms = 1 << ms_mode;
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
- PUSH_DATA (push, 512);
- PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 9));
- PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 9));
+ PUSH_DATA (push, 1024);
+ PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 10));
+ PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 10));
BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms);
PUSH_DATA (push, 256 + 128);
for (i = 0; i < ms; i++) {
@@ -317,9 +317,9 @@ nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
struct nouveau_bo *bo = nvc0->screen->uniform_bo;
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
- PUSH_DATA (push, 512);
- PUSH_DATAh(push, bo->offset + (5 << 16) + (s << 9));
- PUSH_DATA (push, bo->offset + (5 << 16) + (s << 9));
+ PUSH_DATA (push, 1024);
+ PUSH_DATAh(push, bo->offset + (5 << 16) + (s << 10));
+ PUSH_DATA (push, bo->offset + (5 << 16) + (s << 10));
BEGIN_1IC0(push, NVC0_3D(CB_POS), PIPE_MAX_CLIP_PLANES * 4 + 1);
PUSH_DATA (push, 256);
PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
@@ -470,6 +470,39 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
}
}
+static void
+nvc0_validate_buffers(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ int i, s;
+
+ for (s = 0; s < 5; s++) {
+ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+ PUSH_DATA (push, 1024);
+ PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (s << 10));
+ PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (s << 10));
+ BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS);
+ PUSH_DATA (push, 512);
+ for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
+ if (nvc0->buffers[s][i].buffer) {
+ struct nv04_resource *res =
+ nv04_resource(nvc0->buffers[s][i].buffer);
+ PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
+ PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
+ PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
+ PUSH_DATA (push, 0);
+ BCTX_REFN(nvc0->bufctx_3d, BUF, res, RDWR);
+ } else {
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ }
+ }
+ }
+
+}
+
static void
nvc0_validate_sample_mask(struct nvc0_context *nvc0)
{
@@ -663,6 +696,7 @@ static struct state_validate {
{ nve4_set_tex_handles, NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS },
{ nvc0_vertex_arrays_validate, NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS },
{ nvc0_validate_surfaces, NVC0_NEW_SURFACES },
+ { nvc0_validate_buffers, NVC0_NEW_BUFFERS },
{ nvc0_idxbuf_validate, NVC0_NEW_IDXBUF },
{ nvc0_tfb_validate, NVC0_NEW_TFB_TARGETS | NVC0_NEW_GMTYPROG },
{ nvc0_validate_min_samples, NVC0_NEW_MIN_SAMPLES },
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 4e43c4e99fd..71726d1aa59 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -357,27 +357,132 @@ nvc0_clear_render_target(struct pipe_context *pipe,
}
static void
-nvc0_clear_buffer_cpu(struct pipe_context *pipe,
- struct pipe_resource *res,
- unsigned offset, unsigned size,
- const void *data, int data_size)
+nvc0_clear_buffer_push_nvc0(struct pipe_context *pipe,
+ struct pipe_resource *res,
+ unsigned offset, unsigned size,
+ const void *data, int data_size)
{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nv04_resource *buf = nv04_resource(res);
- struct pipe_transfer *pt;
- struct pipe_box box;
- unsigned elements, i;
+ unsigned i;
- elements = size / data_size;
+ nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
+ nouveau_pushbuf_bufctx(push, nvc0->bufctx);
+ nouveau_pushbuf_validate(push);
- u_box_1d(offset, size, &box);
+ unsigned count = (size + 3) / 4;
+ unsigned data_words = data_size / 4;
- uint8_t *map = buf->vtbl->transfer_map(pipe, res, 0, PIPE_TRANSFER_WRITE,
- &box, &pt);
+ while (count) {
+ unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
+ unsigned nr = nr_data * data_words;
- for (i = 0; i < elements; ++i)
- memcpy(&map[i*data_size], data, data_size);
+ if (!PUSH_SPACE(push, nr + 9))
+ break;
- buf->vtbl->transfer_unmap(pipe, pt);
+ BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
+ PUSH_DATAh(push, buf->address + offset);
+ PUSH_DATA (push, buf->address + offset);
+ BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, MIN2(size, nr * 4));
+ PUSH_DATA (push, 1);
+ BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
+ PUSH_DATA (push, 0x100111);
+
+ /* must not be interrupted (trap on QUERY fence, 0x50 works however) */
+ BEGIN_NIC0(push, NVC0_M2MF(DATA), nr);
+ for (i = 0; i < nr_data; i++)
+ PUSH_DATAp(push, data, data_words);
+
+ count -= nr;
+ offset += nr * 4;
+ size -= nr * 4;
+ }
+
+ if (buf->mm) {
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
+ }
+
+ nouveau_bufctx_reset(nvc0->bufctx, 0);
+}
+
+static void
+nvc0_clear_buffer_push_nve4(struct pipe_context *pipe,
+ struct pipe_resource *res,
+ unsigned offset, unsigned size,
+ const void *data, int data_size)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nv04_resource *buf = nv04_resource(res);
+ unsigned i;
+
+ nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
+ nouveau_pushbuf_bufctx(push, nvc0->bufctx);
+ nouveau_pushbuf_validate(push);
+
+ unsigned count = (size + 3) / 4;
+ unsigned data_words = data_size / 4;
+
+ while (count) {
+ unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
+ unsigned nr = nr_data * data_words;
+
+ if (!PUSH_SPACE(push, nr + 10))
+ break;
+
+ BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, buf->address + offset);
+ PUSH_DATA (push, buf->address + offset);
+ BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, MIN2(size, nr * 4));
+ PUSH_DATA (push, 1);
+ /* must not be interrupted (trap on QUERY fence, 0x50 works however) */
+ BEGIN_1IC0(push, NVE4_P2MF(UPLOAD_EXEC), nr + 1);
+ PUSH_DATA (push, 0x1001);
+ for (i = 0; i < nr_data; i++)
+ PUSH_DATAp(push, data, data_words);
+
+ count -= nr;
+ offset += nr * 4;
+ size -= nr * 4;
+ }
+
+ if (buf->mm) {
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
+ }
+
+ nouveau_bufctx_reset(nvc0->bufctx, 0);
+}
+
+static void
+nvc0_clear_buffer_push(struct pipe_context *pipe,
+ struct pipe_resource *res,
+ unsigned offset, unsigned size,
+ const void *data, int data_size)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ unsigned tmp;
+
+ if (data_size == 1) {
+ tmp = *(unsigned char *)data;
+ tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
+ data = &tmp;
+ data_size = 4;
+ } else if (data_size == 2) {
+ tmp = *(unsigned short *)data;
+ tmp = (tmp << 16) | tmp;
+ data = &tmp;
+ data_size = 4;
+ }
+
+ if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
+ nvc0_clear_buffer_push_nvc0(pipe, res, offset, size, data, data_size);
+ else
+ nvc0_clear_buffer_push_nve4(pipe, res, offset, size, data, data_size);
}
static void
@@ -402,10 +507,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
memcpy(&color.ui, data, 16);
break;
case 12:
- /* This doesn't work, RGB32 is not a valid RT format.
- * dst_fmt = PIPE_FORMAT_R32G32B32_UINT;
- * memcpy(&color.ui, data, 12);
- * memset(&color.ui[3], 0, 4);
+ /* RGB32 is not a valid RT format. This will be handled by the pushbuf
+ * uploader.
*/
break;
case 8:
@@ -437,14 +540,26 @@ nvc0_clear_buffer(struct pipe_context *pipe,
assert(size % data_size == 0);
if (data_size == 12) {
- /* TODO: Find a way to do this with the GPU! */
- nvc0_clear_buffer_cpu(pipe, res, offset, size, data, data_size);
+ nvc0_clear_buffer_push(pipe, res, offset, size, data, data_size);
return;
}
+ if (offset & 0xff) {
+ unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
+ assert(fixup_size % data_size == 0);
+ nvc0_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
+ offset += fixup_size;
+ size -= fixup_size;
+ if (!size)
+ return;
+ }
+
elements = size / data_size;
height = (elements + 16383) / 16384;
width = elements / height;
+ if (height > 1)
+ width &= ~0xff;
+ assert(width > 0);
if (!PUSH_SPACE(push, 40))
return;
@@ -465,7 +580,7 @@ nvc0_clear_buffer(struct pipe_context *pipe,
BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 9);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
- PUSH_DATA (push, width * data_size);
+ PUSH_DATA (push, align(width * data_size, 0x100));
PUSH_DATA (push, height);
PUSH_DATA (push, nvc0_format_table[dst_fmt].rt);
PUSH_DATA (push, NVC0_3D_RT_TILE_MODE_LINEAR);
@@ -480,24 +595,20 @@ nvc0_clear_buffer(struct pipe_context *pipe,
IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
+ IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
+ if (buf->mm) {
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
+ nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
+ }
+
if (width * height != elements) {
offset += width * height * data_size;
width = elements - width * height;
- height = 1;
-
- BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 4);
- PUSH_DATAh(push, buf->address + offset);
- PUSH_DATA (push, buf->address + offset);
- PUSH_DATA (push, width * data_size);
- PUSH_DATA (push, height);
-
- IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
+ nvc0_clear_buffer_push(pipe, res, offset, width * data_size,
+ data, data_size);
}
- IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
-
- nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
- nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 74090ce40a5..7223f5aecfb 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -515,12 +515,12 @@ nve4_set_tex_handles(struct nvc0_context *nvc0)
return;
address = nvc0->screen->uniform_bo->offset + (5 << 16);
- for (s = 0; s < 5; ++s, address += (1 << 9)) {
+ for (s = 0; s < 5; ++s, address += (1 << 10)) {
uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
if (!dirty)
continue;
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
- PUSH_DATA (push, 512);
+ PUSH_DATA (push, 1024);
PUSH_DATAh(push, address);
PUSH_DATA (push, address);
do {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index ad79d1cbb9c..44aed1adeeb 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -334,7 +334,7 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)
b = ve->pipe.vertex_buffer_index;
vb = &nvc0->vtxbuf[b];
- if (!vb->buffer) {
+ if (nvc0->vbo_user & (1 << b)) {
if (!(nvc0->constant_vbos & (1 << b))) {
if (ve->pipe.instance_divisor) {
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_DIVISOR(i)), 1);
@@ -352,13 +352,13 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)
if (unlikely(ve->pipe.instance_divisor)) {
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 4);
- PUSH_DATA (push, (1 << 12) | vb->stride);
+ PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
PUSH_DATAh(push, res->address + offset);
PUSH_DATA (push, res->address + offset);
PUSH_DATA (push, ve->pipe.instance_divisor);
} else {
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 3);
- PUSH_DATA (push, (1 << 12) | vb->stride);
+ PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
PUSH_DATAh(push, res->address + offset);
PUSH_DATA (push, res->address + offset);
}
@@ -382,7 +382,7 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
unsigned b;
const uint32_t mask = nvc0->vbo_user;
- PUSH_SPACE(push, nvc0->num_vtxbufs * 8);
+ PUSH_SPACE(push, nvc0->num_vtxbufs * 8 + nvc0->vertex->num_elements);
for (b = 0; b < nvc0->num_vtxbufs; ++b) {
struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[b];
struct nv04_resource *buf;
@@ -395,6 +395,10 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
}
/* address/value set in nvc0_update_user_vbufs_shared */
continue;
+ } else if (!vb->buffer) {
+ /* there can be holes in the vertex buffer lists */
+ IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 0);
+ continue;
}
buf = nv04_resource(vb->buffer);
offset = vb->buffer_offset;
@@ -410,6 +414,12 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
BCTX_REFN(nvc0->bufctx_3d, VTX, buf, RD);
}
+ /* If there are more elements than buffers, we might not have unset
+ * fetching on the later elements.
+ */
+ for (; b < nvc0->vertex->num_elements; ++b)
+ IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 0);
+
if (nvc0->vbo_user)
nvc0_update_user_vbufs_shared(nvc0);
}
@@ -680,7 +690,7 @@ nvc0_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
if (count & 1) {
count--;
- PUSH_SPACE(push, 1);
+ PUSH_SPACE(push, 2);
BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
PUSH_DATA (push, *map++);
}
@@ -779,7 +789,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
PUSH_SPACE(push, 2);
IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
- nvc0_hw_query_fifo_wait(push, nvc0_query(so->pq));
+ nvc0_hw_query_fifo_wait(nvc0, nvc0_query(so->pq));
if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
@@ -811,6 +821,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
uint32_t offset = buf->offset + info->indirect_offset;
+ PUSH_SPACE(push, 7);
+
/* must make FIFO wait for engines idle before continuing to process */
if ((buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) ||
(buf_count && buf_count->fence_wr &&
@@ -951,6 +963,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
if (info->mode == PIPE_PRIM_PATCHES &&
nvc0->state.patch_vertices != info->vertices_per_patch) {
nvc0->state.patch_vertices = info->vertices_per_patch;
+ PUSH_SPACE(push, 1);
IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices);
}
@@ -958,6 +971,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
nvc0_state_validate(nvc0, ~0, 8);
if (nvc0->vertprog->vp.need_draw_parameters) {
+ PUSH_SPACE(push, 9);
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 512);
PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
@@ -979,6 +993,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
}
if (nvc0->cb_dirty) {
+ PUSH_SPACE(push, 1);
IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
nvc0->cb_dirty = false;
}
@@ -987,6 +1002,8 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
if (!nvc0->textures_coherent[s])
continue;
+ PUSH_SPACE(push, nvc0->num_textures[s] * 2);
+
for (int i = 0; i < nvc0->num_textures[s]; ++i) {
struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
if (!(nvc0->textures_coherent[s] & (1 << i)))
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 90c4f71a945..a2b7f87855d 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -210,6 +210,10 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
/* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 20945ece155..2cf08897a8d 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -225,7 +225,7 @@ void *evergreen_create_compute_state(
}
}
#else
- memset(&shader->binary, 0, sizeof(shader->binary));
+ radeon_shader_binary_init(&shader->binary);
radeon_elf_read(code, header->num_bytes, &shader->binary);
r600_create_shader(&shader->bc, &shader->binary, &use_kill);
@@ -245,13 +245,31 @@ void *evergreen_create_compute_state(
return shader;
}
-void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
+void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
{
- struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
+ struct r600_context *ctx = (struct r600_context *)ctx_;
+ COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
+ struct r600_pipe_compute *shader = state;
if (!shader)
return;
+#ifdef HAVE_OPENCL
+#if HAVE_LLVM < 0x0306
+ for (unsigned i = 0; i < shader->num_kernels; i++) {
+ struct r600_kernel *kernel = &shader->kernels[i];
+ LLVMDisposeModule(module);
+ }
+ FREE(shader->kernels);
+ LLVMContextDispose(shader->llvm_ctx);
+#else
+ radeon_shader_binary_clean(&shader->binary);
+ r600_destroy_shader(&shader->bc);
+
+ /* TODO destroy shader->code_bo, shader->const_bo
+ * we'll need something like r600_buffer_free */
+#endif
+#endif
FREE(shader);
}
@@ -349,7 +367,7 @@ static void evergreen_emit_direct_dispatch(
struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
unsigned num_waves;
- unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
+ unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
unsigned wave_divisor = (16 * num_pipes);
int group_size = 1;
int grid_size = 1;
@@ -723,7 +741,7 @@ static void evergreen_set_global_binding(
* command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
* packet requires that the shader type bit be set, we must initialize all
* context registers needed for compute in this function. The registers
- * intialized by the start_cs_cmd atom can be found in evereen_state.c in the
+ * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
* functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
* on the GPU family.
*/
@@ -733,7 +751,7 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
int num_threads;
int num_stack_entries;
- /* since all required registers are initialised in the
+ /* since all required registers are initialized in the
* start_compute_cs_cmd atom, we can EMIT_EARLY here.
*/
r600_init_command_buffer(cb, 256);
@@ -818,7 +836,7 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
* R_008E28_SQ_STATIC_THREAD_MGMT3
*/
- /* XXX: We may need to adjust the thread and stack resouce
+ /* XXX: We may need to adjust the thread and stack resource
* values for 3D/compute interop */
r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 9dfb84965cf..61d32c06671 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -772,7 +772,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
if (util_format_get_blocksize(pipe_format) >= 16)
non_disp_tiling = 1;
}
- nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
+ nbanks = eg_num_banks(rscreen->b.info.r600_num_banks);
if (state->target == PIPE_TEXTURE_1D_ARRAY) {
height = 1;
@@ -986,7 +986,7 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx,
unsigned block_size =
align(util_format_get_blocksize(pipe_buffer->format), 4);
unsigned pitch_alignment =
- MAX2(64, rctx->screen->b.tiling_info.group_bytes / block_size);
+ MAX2(64, rctx->screen->b.info.pipe_interleave_bytes / block_size);
unsigned pitch = align(pipe_buffer->width0, pitch_alignment);
/* XXX: This is copied from evergreen_init_color_surface(). I don't
@@ -1098,7 +1098,7 @@ void evergreen_init_color_surface(struct r600_context *rctx,
if (util_format_get_blocksize(surf->base.format) >= 16)
non_disp_tiling = 1;
}
- nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
+ nbanks = eg_num_banks(rscreen->b.info.r600_num_banks);
desc = util_format_description(surf->base.format);
for (i = 0; i < 4; i++) {
if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
@@ -1253,7 +1253,7 @@ static void evergreen_init_depth_surface(struct r600_context *rctx,
macro_aspect = eg_macro_tile_aspect(macro_aspect);
bankw = eg_bank_wh(bankw);
bankh = eg_bank_wh(bankh);
- nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
+ nbanks = eg_num_banks(rscreen->b.info.r600_num_banks);
offset >>= 8;
surf->db_z_info = S_028040_ARRAY_MODE(array_mode) |
@@ -3467,7 +3467,7 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
sub_cmd = EG_DMA_COPY_TILED;
lbpp = util_logbase2(bpp);
pitch_tile_max = ((pitch / bpp) / 8) - 1;
- nbanks = eg_num_banks(rctx->screen->b.tiling_info.num_banks);
+ nbanks = eg_num_banks(rctx->screen->b.info.r600_num_banks);
if (dst_mode == RADEON_SURF_MODE_LINEAR) {
/* T2L */
@@ -3670,9 +3670,9 @@ void evergreen_init_state_functions(struct r600_context *rctx)
unsigned id = 1;
unsigned i;
/* !!!
- * To avoid GPU lockup registers must be emited in a specific order
+ * To avoid GPU lockup registers must be emitted in a specific order
* (no kidding ...). The order below is important and have been
- * partialy infered from analyzing fglrx command stream.
+ * partially inferred from analyzing fglrx command stream.
*
* Don't reorder atom without carefully checking the effect (GPU lockup
* or piglit regression).
@@ -3793,7 +3793,7 @@ void evergreen_setup_tess_constants(struct r600_context *rctx, const struct pipe
unsigned output_patch0_offset, perpatch_output_offset, lds_size;
uint32_t values[16];
unsigned num_waves;
- unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
+ unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
unsigned wave_divisor = (16 * num_pipes);
*num_patches = 1;
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 0b78290295a..1629399d8fe 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -245,8 +245,8 @@ struct r600_bytecode {
unsigned ar_chan;
unsigned ar_handling;
unsigned r6xx_nop_after_rel_dst;
- bool index_loaded[2];
- unsigned index_reg[2]; /* indexing register CF_INDEX_[01] */
+ bool index_loaded[2];
+ unsigned index_reg[2]; /* indexing register CF_INDEX_[01] */
unsigned debug_id;
struct r600_isa* isa;
};
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 8b91372f3ae..0fe7c74418d 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -848,6 +848,7 @@ LLVMModuleRef r600_tgsi_llvm(
lp_build_tgsi_llvm(bld_base, tokens);
+ LLVMBuildRetVoid(bld_base->base.gallivm->builder);
radeon_llvm_finalize_module(ctx);
return ctx->gallivm.module;
@@ -910,6 +911,11 @@ unsigned r600_create_shader(struct r600_bytecode *bc,
return 0;
}
+void r600_destroy_shader(struct r600_bytecode *bc)
+{
+ FREE(bc->bytecode);
+}
+
unsigned r600_llvm_compile(
LLVMModuleRef mod,
enum radeon_family family,
@@ -922,17 +928,14 @@ unsigned r600_llvm_compile(
struct radeon_shader_binary binary;
const char * gpu_family = r600_get_llvm_processor_name(family);
- memset(&binary, 0, sizeof(struct radeon_shader_binary));
+ radeon_shader_binary_init(&binary);
if (dump)
LLVMDumpModule(mod);
r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug);
r = r600_create_shader(bc, &binary, use_kill);
- FREE(binary.code);
- FREE(binary.config);
- FREE(binary.rodata);
- FREE(binary.global_symbol_offsets);
+ radeon_shader_binary_clean(&binary);
return r;
}
diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h
index f570b739fbe..3f7fc4bef7e 100644
--- a/src/gallium/drivers/r600/r600_llvm.h
+++ b/src/gallium/drivers/r600/r600_llvm.h
@@ -30,6 +30,8 @@ unsigned r600_create_shader(struct r600_bytecode *bc,
const struct radeon_shader_binary *binary,
boolean *use_kill);
+void r600_destroy_shader(struct r600_bytecode *bc);
+
void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
struct r600_bytecode *bc,
uint64_t symbol_offset,
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 9b0f31270df..9d378013be0 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -285,6 +285,8 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_INVALIDATE_BUFFER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 1;
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@@ -342,6 +344,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
/* kernel command checker support is also required */
return family >= CHIP_CEDAR && rscreen->b.info.drm_minor >= 41;
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ return family >= CHIP_CEDAR ? 0 : 1;
+
/* Unsupported features. */
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
@@ -364,6 +369,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 0;
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -415,10 +421,10 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
/* Timer queries, present when the clock frequency is non zero. */
case PIPE_CAP_QUERY_TIME_ELAPSED:
- return rscreen->b.info.r600_clock_crystal_freq != 0;
+ return rscreen->b.info.clock_crystal_freq != 0;
case PIPE_CAP_QUERY_TIMESTAMP:
return rscreen->b.info.drm_minor >= 20 &&
- rscreen->b.info.r600_clock_crystal_freq != 0;
+ rscreen->b.info.clock_crystal_freq != 0;
case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
case PIPE_CAP_MIN_TEXEL_OFFSET:
diff --git a/src/gallium/drivers/r600/r600_uvd.c b/src/gallium/drivers/r600/r600_uvd.c
index 18d2b69afb0..0c928345773 100644
--- a/src/gallium/drivers/r600/r600_uvd.c
+++ b/src/gallium/drivers/r600/r600_uvd.c
@@ -160,7 +160,7 @@ static struct pb_buffer* r600_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_
struct r600_texture *chroma = (struct r600_texture *)buf->resources[1];
msg->body.decode.dt_field_mode = buf->base.interlaced;
- msg->body.decode.dt_surf_tile_config |= RUVD_NUM_BANKS(eg_num_banks(rscreen->b.tiling_info.num_banks));
+ msg->body.decode.dt_surf_tile_config |= RUVD_NUM_BANKS(eg_num_banks(rscreen->b.info.r600_num_banks));
ruvd_set_dt_surfaces(msg, &luma->surface, &chroma->surface);
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index c7984c47304..b384baa9237 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -181,7 +181,7 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
old_buf = res->buf;
res->buf = new_buf; /* should be atomic */
- if (rscreen->info.r600_virtual_address)
+ if (rscreen->info.has_virtual_memory)
res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf);
else
res->gpu_address = 0;
@@ -511,7 +511,7 @@ r600_buffer_from_user_memory(struct pipe_screen *screen,
return NULL;
}
- if (rscreen->info.r600_virtual_address)
+ if (rscreen->info.has_virtual_memory)
rbuffer->gpu_address =
ws->buffer_get_virtual_address(rbuffer->buf);
else
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index caf7deef37c..ff5b055448a 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -60,7 +60,7 @@ static inline void r600_emit_reloc(struct r600_common_context *rctx,
enum radeon_bo_priority priority)
{
struct radeon_winsys_cs *cs = ring->cs;
- bool has_vm = ((struct r600_common_screen*)rctx->b.screen)->info.r600_virtual_address;
+ bool has_vm = ((struct r600_common_screen*)rctx->b.screen)->info.has_virtual_memory;
unsigned reloc = radeon_add_to_buffer_list(rctx, ring, rbo, usage, priority);
if (!has_vm) {
diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c
index fad7bdec40a..f3529a1fe0f 100644
--- a/src/gallium/drivers/radeon/r600_perfcounter.c
+++ b/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -33,10 +33,6 @@
/* Max counters per HW block */
#define R600_QUERY_MAX_COUNTERS 16
-static const char * const r600_pc_shader_suffix[] = {
- "", "_PS", "_VS", "_GS", "_ES", "_HS", "_LS", "_CS"
-};
-
static struct r600_perfcounter_block *
lookup_counter(struct r600_perfcounters *pc, unsigned index,
unsigned *base_gid, unsigned *sub_index)
@@ -92,6 +88,8 @@ struct r600_pc_counter {
unsigned stride;
};
+#define R600_PC_SHADERS_WINDOWING (1 << 31)
+
struct r600_query_pc {
struct r600_query_hw b;
@@ -246,32 +244,29 @@ static struct r600_pc_group *get_group_state(struct r600_common_screen *screen,
if (block->flags & R600_PC_BLOCK_SHADER) {
unsigned sub_gids = block->num_instances;
unsigned shader_id;
- unsigned shader_mask;
- unsigned query_shader_mask;
+ unsigned shaders;
+ unsigned query_shaders;
if (block->flags & R600_PC_BLOCK_SE_GROUPS)
sub_gids = sub_gids * screen->info.max_se;
shader_id = sub_gid / sub_gids;
sub_gid = sub_gid % sub_gids;
- if (shader_id == 0)
- shader_mask = R600_PC_SHADER_ALL;
- else
- shader_mask = 1 << (shader_id - 1);
+ shaders = screen->perfcounters->shader_type_bits[shader_id];
- query_shader_mask = query->shaders & R600_PC_SHADER_ALL;
- if (query_shader_mask && query_shader_mask != shader_mask) {
+ query_shaders = query->shaders & ~R600_PC_SHADERS_WINDOWING;
+ if (query_shaders && query_shaders != shaders) {
fprintf(stderr, "r600_perfcounter: incompatible shader groups\n");
FREE(group);
return NULL;
}
- query->shaders |= shader_mask;
+ query->shaders = shaders;
}
- if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED) {
+ if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
// A non-zero value in query->shaders ensures that the shader
// masking is reset unless the user explicitly requests one.
- query->shaders |= R600_PC_SHADER_WINDOWING;
+ query->shaders = R600_PC_SHADERS_WINDOWING;
}
if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
@@ -379,8 +374,8 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
}
if (query->shaders) {
- if ((query->shaders & R600_PC_SHADER_ALL) == 0)
- query->shaders |= R600_PC_SHADER_ALL;
+ if (query->shaders == R600_PC_SHADERS_WINDOWING)
+ query->shaders = 0xffffffff;
query->b.num_cs_dw_begin += pc->num_shaders_cs_dwords;
}
@@ -438,7 +433,7 @@ static boolean r600_init_block_names(struct r600_common_screen *screen,
if (block->flags & R600_PC_BLOCK_SE_GROUPS)
groups_se = screen->info.max_se;
if (block->flags & R600_PC_BLOCK_SHADER)
- groups_shader = ARRAY_SIZE(r600_pc_shader_suffix);
+ groups_shader = screen->perfcounters->num_shader_types;
namelen = strlen(block->basename);
block->group_name_stride = namelen + 1;
@@ -462,14 +457,15 @@ static boolean r600_init_block_names(struct r600_common_screen *screen,
groupname = block->group_names;
for (i = 0; i < groups_shader; ++i) {
- unsigned shaderlen = strlen(r600_pc_shader_suffix[i]);
+ const char *shader_suffix = screen->perfcounters->shader_type_suffixes[i];
+ unsigned shaderlen = strlen(shader_suffix);
for (j = 0; j < groups_se; ++j) {
for (k = 0; k < groups_instance; ++k) {
strcpy(groupname, block->basename);
p = groupname + namelen;
if (block->flags & R600_PC_BLOCK_SHADER) {
- strcpy(p, r600_pc_shader_suffix[i]);
+ strcpy(p, shader_suffix);
p += shaderlen;
}
@@ -626,7 +622,7 @@ void r600_perfcounters_add_block(struct r600_common_screen *rscreen,
if (block->flags & R600_PC_BLOCK_SE_GROUPS)
block->num_groups *= rscreen->info.max_se;
if (block->flags & R600_PC_BLOCK_SHADER)
- block->num_groups *= ARRAY_SIZE(r600_pc_shader_suffix);
+ block->num_groups *= pc->num_shader_types;
++pc->num_blocks;
pc->num_groups += block->num_groups;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 4c066c14cd8..d75317b1cbe 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -48,6 +48,26 @@ struct r600_multi_fence {
struct pipe_fence_handle *sdma;
};
+/*
+ * shader binary helpers.
+ */
+void radeon_shader_binary_init(struct radeon_shader_binary *b)
+{
+ memset(b, 0, sizeof(*b));
+}
+
+void radeon_shader_binary_clean(struct radeon_shader_binary *b)
+{
+ if (!b)
+ return;
+ FREE(b->code);
+ FREE(b->config);
+ FREE(b->rodata);
+ FREE(b->global_symbol_offsets);
+ FREE(b->relocs);
+ FREE(b->disasm_string);
+}
+
/*
* pipe_context
*/
@@ -251,7 +271,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
rctx->chip_class = rscreen->chip_class;
if (rscreen->chip_class >= CIK)
- rctx->max_db = MAX2(8, rscreen->info.r600_num_backends);
+ rctx->max_db = MAX2(8, rscreen->info.num_render_backends);
else if (rscreen->chip_class >= EVERGREEN)
rctx->max_db = 8;
else
@@ -295,7 +315,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
if (!rctx->ctx)
return false;
- if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
+ if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
r600_flush_dma_ring,
rctx, NULL);
@@ -373,6 +393,7 @@ static const struct debug_named_value common_debug_options[] = {
{ "noir", DBG_NO_IR, "Don't print the LLVM IR"},
{ "notgsi", DBG_NO_TGSI, "Don't print the TGSI"},
{ "noasm", DBG_NO_ASM, "Don't print disassembled shaders"},
+ { "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial optimizations" },
/* features */
{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
@@ -389,6 +410,7 @@ static const struct debug_named_value common_debug_options[] = {
{ "nodcc", DBG_NO_DCC, "Disable DCC." },
{ "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." },
{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
+ { "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." },
DEBUG_NAMED_VALUE_END /* must be last */
};
@@ -698,7 +720,7 @@ static int r600_get_compute_param(struct pipe_screen *screen,
case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
if (ret) {
uint32_t *max_clock_frequency = ret;
- *max_clock_frequency = rscreen->info.max_sclk;
+ *max_clock_frequency = rscreen->info.max_shader_clock;
}
return sizeof(uint32_t);
@@ -734,7 +756,7 @@ static uint64_t r600_get_timestamp(struct pipe_screen *screen)
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) /
- rscreen->info.r600_clock_crystal_freq;
+ rscreen->info.clock_crystal_freq;
}
static void r600_fence_reference(struct pipe_screen *screen,
@@ -778,116 +800,40 @@ static boolean r600_fence_finish(struct pipe_screen *screen,
return rws->fence_wait(rws, rfence->gfx, timeout);
}
-static bool r600_interpret_tiling(struct r600_common_screen *rscreen,
- uint32_t tiling_config)
+static void r600_query_memory_info(struct pipe_screen *screen,
+ struct pipe_memory_info *info)
{
- switch ((tiling_config & 0xe) >> 1) {
- case 0:
- rscreen->tiling_info.num_channels = 1;
- break;
- case 1:
- rscreen->tiling_info.num_channels = 2;
- break;
- case 2:
- rscreen->tiling_info.num_channels = 4;
- break;
- case 3:
- rscreen->tiling_info.num_channels = 8;
- break;
- default:
- return false;
- }
+ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+ struct radeon_winsys *ws = rscreen->ws;
+ unsigned vram_usage, gtt_usage;
- switch ((tiling_config & 0x30) >> 4) {
- case 0:
- rscreen->tiling_info.num_banks = 4;
- break;
- case 1:
- rscreen->tiling_info.num_banks = 8;
- break;
- default:
- return false;
+ info->total_device_memory = rscreen->info.vram_size / 1024;
+ info->total_staging_memory = rscreen->info.gart_size / 1024;
- }
- switch ((tiling_config & 0xc0) >> 6) {
- case 0:
- rscreen->tiling_info.group_bytes = 256;
- break;
- case 1:
- rscreen->tiling_info.group_bytes = 512;
- break;
- default:
- return false;
- }
- return true;
-}
+ /* The real TTM memory usage is somewhat random, because:
+ *
+ * 1) TTM delays freeing memory, because it can only free it after
+ * fences expire.
+ *
+ * 2) The memory usage can be really low if big VRAM evictions are
+ * taking place, but the real usage is well above the size of VRAM.
+ *
+ * Instead, return statistics of this process.
+ */
+ vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024;
+ gtt_usage = ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024;
-static bool evergreen_interpret_tiling(struct r600_common_screen *rscreen,
- uint32_t tiling_config)
-{
- switch (tiling_config & 0xf) {
- case 0:
- rscreen->tiling_info.num_channels = 1;
- break;
- case 1:
- rscreen->tiling_info.num_channels = 2;
- break;
- case 2:
- rscreen->tiling_info.num_channels = 4;
- break;
- case 3:
- rscreen->tiling_info.num_channels = 8;
- break;
- default:
- return false;
- }
+ info->avail_device_memory =
+ vram_usage <= info->total_device_memory ?
+ info->total_device_memory - vram_usage : 0;
+ info->avail_staging_memory =
+ gtt_usage <= info->total_staging_memory ?
+ info->total_staging_memory - gtt_usage : 0;
- switch ((tiling_config & 0xf0) >> 4) {
- case 0:
- rscreen->tiling_info.num_banks = 4;
- break;
- case 1:
- rscreen->tiling_info.num_banks = 8;
- break;
- case 2:
- rscreen->tiling_info.num_banks = 16;
- break;
- default:
- return false;
- }
-
- switch ((tiling_config & 0xf00) >> 8) {
- case 0:
- rscreen->tiling_info.group_bytes = 256;
- break;
- case 1:
- rscreen->tiling_info.group_bytes = 512;
- break;
- default:
- return false;
- }
- return true;
-}
-
-static bool r600_init_tiling(struct r600_common_screen *rscreen)
-{
- uint32_t tiling_config = rscreen->info.r600_tiling_config;
-
- /* set default group bytes, overridden by tiling info ioctl */
- if (rscreen->chip_class <= R700) {
- rscreen->tiling_info.group_bytes = 256;
- } else {
- rscreen->tiling_info.group_bytes = 512;
- }
-
- if (!tiling_config)
- return true;
-
- if (rscreen->chip_class <= R700) {
- return r600_interpret_tiling(rscreen, tiling_config);
- } else {
- return evergreen_interpret_tiling(rscreen, tiling_config);
- }
+ info->device_memory_evicted =
+ ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
+ /* Just return the number of evicted 64KB pages. */
+ info->nr_device_memory_evictions = info->device_memory_evicted / 64;
}
struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
@@ -929,6 +875,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
rscreen->b.fence_reference = r600_fence_reference;
rscreen->b.resource_destroy = u_resource_destroy_vtbl;
rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory;
+ rscreen->b.query_memory_info = r600_query_memory_info;
if (rscreen->info.has_uvd) {
rscreen->b.get_video_param = rvid_get_video_param;
@@ -946,9 +893,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
rscreen->chip_class = rscreen->info.chip_class;
rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
- if (!r600_init_tiling(rscreen)) {
- return false;
- }
util_format_s3tc_init();
pipe_mutex_init(rscreen->aux_context_lock);
pipe_mutex_init(rscreen->gpu_load_mutex);
@@ -968,27 +912,34 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
if (rscreen->debug_flags & DBG_INFO) {
printf("pci_id = 0x%x\n", rscreen->info.pci_id);
- printf("family = %i\n", rscreen->info.family);
+ printf("family = %i (%s)\n", rscreen->info.family,
+ r600_get_chip_name(rscreen));
printf("chip_class = %i\n", rscreen->info.chip_class);
- printf("gart_size = %i MB\n", (int)(rscreen->info.gart_size >> 20));
- printf("vram_size = %i MB\n", (int)(rscreen->info.vram_size >> 20));
- printf("max_sclk = %i\n", rscreen->info.max_sclk);
+ printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024));
+ printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024));
+ printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory);
+ printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2);
+ printf("has_sdma = %i\n", rscreen->info.has_sdma);
+ printf("has_uvd = %i\n", rscreen->info.has_uvd);
+ printf("vce_fw_version = %i\n", rscreen->info.vce_fw_version);
+ printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config);
+ printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq);
+ printf("drm = %i.%i.%i\n", rscreen->info.drm_major,
+ rscreen->info.drm_minor, rscreen->info.drm_patchlevel);
+ printf("has_userptr = %i\n", rscreen->info.has_userptr);
+
+ printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes);
+ printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock);
printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units);
printf("max_se = %i\n", rscreen->info.max_se);
printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se);
- printf("drm = %i.%i.%i\n", rscreen->info.drm_major,
- rscreen->info.drm_minor, rscreen->info.drm_patchlevel);
- printf("has_uvd = %i\n", rscreen->info.has_uvd);
- printf("vce_fw_version = %i\n", rscreen->info.vce_fw_version);
- printf("r600_num_backends = %i\n", rscreen->info.r600_num_backends);
- printf("r600_clock_crystal_freq = %i\n", rscreen->info.r600_clock_crystal_freq);
- printf("r600_tiling_config = 0x%x\n", rscreen->info.r600_tiling_config);
- printf("r600_num_tile_pipes = %i\n", rscreen->info.r600_num_tile_pipes);
- printf("r600_max_pipes = %i\n", rscreen->info.r600_max_pipes);
- printf("r600_virtual_address = %i\n", rscreen->info.r600_virtual_address);
- printf("r600_has_dma = %i\n", rscreen->info.r600_has_dma);
- printf("r600_backend_map = %i\n", rscreen->info.r600_backend_map);
- printf("r600_backend_map_valid = %i\n", rscreen->info.r600_backend_map_valid);
+
+ printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map);
+ printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid);
+ printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks);
+ printf("num_render_backends = %i\n", rscreen->info.num_render_backends);
+ printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
+ printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
printf("si_tile_mode_array_valid = %i\n", rscreen->info.si_tile_mode_array_valid);
printf("cik_macrotile_mode_array_valid = %i\n", rscreen->info.cik_macrotile_mode_array_valid);
}
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index d66e74f9254..e92df876c22 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -71,6 +71,7 @@
#define DBG_NO_IR (1 << 12)
#define DBG_NO_TGSI (1 << 13)
#define DBG_NO_ASM (1 << 14)
+#define DBG_PREOPT_IR (1 << 15)
/* Bits 21-31 are reserved for the r600g driver. */
/* features */
#define DBG_NO_ASYNC_DMA (1llu << 32)
@@ -87,6 +88,7 @@
#define DBG_NO_DCC (1llu << 43)
#define DBG_NO_DCC_CLEAR (1llu << 44)
#define DBG_NO_RB_PLUS (1llu << 45)
+#define DBG_SI_SCHED (1llu << 46)
#define R600_MAP_BUFFER_ALIGNMENT 64
@@ -129,6 +131,9 @@ struct radeon_shader_binary {
char *disasm_string;
};
+void radeon_shader_binary_init(struct radeon_shader_binary *b);
+void radeon_shader_binary_clean(struct radeon_shader_binary *b);
+
struct r600_resource {
struct u_resource b;
@@ -257,8 +262,6 @@ struct r600_surface {
unsigned spi_shader_col_format_alpha; /* SI+, alpha-to-coverage */
unsigned spi_shader_col_format_blend; /* SI+, blending without alpha. */
unsigned spi_shader_col_format_blend_alpha; /* SI+, blending with alpha. */
- unsigned sx_ps_downconvert; /* Stoney only */
- unsigned sx_blend_opt_epsilon; /* Stoney only */
struct r600_resource *cb_buffer_fmask; /* Used for FMASK relocations. R600 only */
struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */
@@ -278,19 +281,12 @@ struct r600_surface {
unsigned pa_su_poly_offset_db_fmt_cntl;
};
-struct r600_tiling_info {
- unsigned num_channels;
- unsigned num_banks;
- unsigned group_bytes;
-};
-
struct r600_common_screen {
struct pipe_screen b;
struct radeon_winsys *ws;
enum radeon_family family;
enum chip_class chip_class;
struct radeon_info info;
- struct r600_tiling_info tiling_info;
uint64_t debug_flags;
bool has_cp_dma;
bool has_streamout;
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 0aa19cd54fe..f8b62411722 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -100,6 +100,12 @@ static boolean r600_query_sw_begin(struct r600_common_context *rctx,
case R600_QUERY_NUM_SHADERS_CREATED:
query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
break;
+ case R600_QUERY_GPIN_ASIC_ID:
+ case R600_QUERY_GPIN_NUM_SIMD:
+ case R600_QUERY_GPIN_NUM_RB:
+ case R600_QUERY_GPIN_NUM_SPI:
+ case R600_QUERY_GPIN_NUM_SE:
+ break;
default:
unreachable("r600_query_sw_begin: bad query type");
}
@@ -146,6 +152,12 @@ static void r600_query_sw_end(struct r600_common_context *rctx,
case R600_QUERY_NUM_SHADERS_CREATED:
query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
break;
+ case R600_QUERY_GPIN_ASIC_ID:
+ case R600_QUERY_GPIN_NUM_SIMD:
+ case R600_QUERY_GPIN_NUM_RB:
+ case R600_QUERY_GPIN_NUM_SPI:
+ case R600_QUERY_GPIN_NUM_SE:
+ break;
default:
unreachable("r600_query_sw_end: bad query type");
}
@@ -162,7 +174,7 @@ static boolean r600_query_sw_get_result(struct r600_common_context *rctx,
case PIPE_QUERY_TIMESTAMP_DISJOINT:
/* Convert from cycles per millisecond to cycles per second (Hz). */
result->timestamp_disjoint.frequency =
- (uint64_t)rctx->screen->info.r600_clock_crystal_freq * 1000;
+ (uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
result->timestamp_disjoint.disjoint = FALSE;
return TRUE;
case PIPE_QUERY_GPU_FINISHED: {
@@ -171,6 +183,22 @@ static boolean r600_query_sw_get_result(struct r600_common_context *rctx,
wait ? PIPE_TIMEOUT_INFINITE : 0);
return result->b;
}
+
+ case R600_QUERY_GPIN_ASIC_ID:
+ result->u32 = 0;
+ return TRUE;
+ case R600_QUERY_GPIN_NUM_SIMD:
+ result->u32 = rctx->screen->info.num_good_compute_units;
+ return TRUE;
+ case R600_QUERY_GPIN_NUM_RB:
+ result->u32 = rctx->screen->info.num_render_backends;
+ return TRUE;
+ case R600_QUERY_GPIN_NUM_SPI:
+ result->u32 = 1; /* all supported chips have one SPI per SE */
+ return TRUE;
+ case R600_QUERY_GPIN_NUM_SE:
+ result->u32 = rctx->screen->info.max_se;
+ return TRUE;
}
result->u64 = query->end_result - query->begin_result;
@@ -908,7 +936,7 @@ boolean r600_query_hw_get_result(struct r600_common_context *rctx,
/* Convert the time to expected units. */
if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
rquery->type == PIPE_QUERY_TIMESTAMP) {
- result->u64 = (1000000 * result->u64) / rctx->screen->info.r600_clock_crystal_freq;
+ result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq;
}
return TRUE;
}
@@ -1021,13 +1049,13 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
struct radeon_winsys_cs *cs = ctx->gfx.cs;
struct r600_resource *buffer;
uint32_t *results;
- unsigned num_backends = ctx->screen->info.r600_num_backends;
+ unsigned num_backends = ctx->screen->info.num_render_backends;
unsigned i, mask = 0;
/* if backend_map query is supported by the kernel */
- if (ctx->screen->info.r600_backend_map_valid) {
- unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
- unsigned backend_map = ctx->screen->info.r600_backend_map;
+ if (ctx->screen->info.r600_gb_backend_map_valid) {
+ unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes;
+ unsigned backend_map = ctx->screen->info.r600_gb_backend_map;
unsigned item_width, item_mask;
if (ctx->chip_class >= EVERGREEN) {
@@ -1096,15 +1124,21 @@ err:
return;
}
-#define X(name_, query_type_, type_, result_type_) \
+#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
{ \
.name = name_, \
.query_type = R600_QUERY_##query_type_, \
.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
- .group_id = ~(unsigned)0 \
+ .group_id = group_id_ \
}
+#define X(name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
+
+#define XG(group_, name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
+
static struct pipe_driver_query_info r600_driver_query_list[] = {
X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
@@ -1116,6 +1150,20 @@ static struct pipe_driver_query_info r600_driver_query_list[] = {
X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
+
+ /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
+ * which use it as a fallback path to detect the GPU type.
+ *
+ * Note: The names of these queries are significant for GPUPerfStudio
+ * (and possibly their order as well). */
+ XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
+ XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
+ XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
+ XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
+ XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
+
+ /* The following queries must be at the end of the list because their
+ * availability is adjusted dynamically based on the DRM version. */
X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
@@ -1123,6 +1171,8 @@ static struct pipe_driver_query_info r600_driver_query_list[] = {
};
#undef X
+#undef XG
+#undef XFULL
static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
{
@@ -1167,16 +1217,40 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
break;
}
+ if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
+ info->group_id += rscreen->perfcounters->num_groups;
+
return 1;
}
+/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
+ * performance counter groups, so be careful when changing this and related
+ * functions.
+ */
static int r600_get_driver_query_group_info(struct pipe_screen *screen,
unsigned index,
struct pipe_driver_query_group_info *info)
{
struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
+ unsigned num_pc_groups = 0;
- return r600_get_perfcounter_group_info(rscreen, index, info);
+ if (rscreen->perfcounters)
+ num_pc_groups = rscreen->perfcounters->num_groups;
+
+ if (!info)
+ return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;
+
+ if (index < num_pc_groups)
+ return r600_get_perfcounter_group_info(rscreen, index, info);
+
+ index -= num_pc_groups;
+ if (index >= R600_NUM_SW_QUERY_GROUPS)
+ return 0;
+
+ info->name = "GPIN";
+ info->max_active_queries = 5;
+ info->num_queries = 5;
+ return 1;
}
void r600_query_init(struct r600_common_context *rctx)
@@ -1189,7 +1263,7 @@ void r600_query_init(struct r600_common_context *rctx)
rctx->b.get_query_result = r600_get_query_result;
rctx->render_cond_atom.emit = r600_emit_query_predication;
- if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0)
+ if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
rctx->b.render_condition = r600_render_condition;
LIST_INITHEAD(&rctx->active_nontimer_queries);
diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h
index e5a98bfe5bd..8b2c4e3fe93 100644
--- a/src/gallium/drivers/radeon/r600_query.h
+++ b/src/gallium/drivers/radeon/r600_query.h
@@ -54,8 +54,18 @@ struct r600_resource;
#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11)
#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12)
#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13)
+#define R600_QUERY_GPIN_ASIC_ID (PIPE_QUERY_DRIVER_SPECIFIC + 14)
+#define R600_QUERY_GPIN_NUM_SIMD (PIPE_QUERY_DRIVER_SPECIFIC + 15)
+#define R600_QUERY_GPIN_NUM_RB (PIPE_QUERY_DRIVER_SPECIFIC + 16)
+#define R600_QUERY_GPIN_NUM_SPI (PIPE_QUERY_DRIVER_SPECIFIC + 17)
+#define R600_QUERY_GPIN_NUM_SE (PIPE_QUERY_DRIVER_SPECIFIC + 18)
#define R600_QUERY_FIRST_PERFCOUNTER (PIPE_QUERY_DRIVER_SPECIFIC + 100)
+enum {
+ R600_QUERY_GROUP_GPIN = 0,
+ R600_NUM_SW_QUERY_GROUPS
+};
+
struct r600_query_ops {
void (*destroy)(struct r600_common_context *, struct r600_query *);
boolean (*begin)(struct r600_common_context *, struct r600_query *);
@@ -156,24 +166,6 @@ enum {
R600_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};
-/* Shader enable bits. Chosen to coincide with SQ_PERFCOUNTER_CTRL values */
-enum {
- R600_PC_SHADER_PS = (1 << 0),
- R600_PC_SHADER_VS = (1 << 1),
- R600_PC_SHADER_GS = (1 << 2),
- R600_PC_SHADER_ES = (1 << 3),
- R600_PC_SHADER_HS = (1 << 4),
- R600_PC_SHADER_LS = (1 << 5),
- R600_PC_SHADER_CS = (1 << 6),
-
- R600_PC_SHADER_ALL = R600_PC_SHADER_PS | R600_PC_SHADER_VS |
- R600_PC_SHADER_GS | R600_PC_SHADER_ES |
- R600_PC_SHADER_HS | R600_PC_SHADER_LS |
- R600_PC_SHADER_CS,
-
- R600_PC_SHADER_WINDOWING = (1 << 31),
-};
-
/* Describes a hardware block with performance counters. Multiple instances of
* each block, possibly per-SE, may exist on the chip. Depending on the block
* and on the user's configuration, we either
@@ -210,6 +202,10 @@ struct r600_perfcounters {
unsigned num_instance_cs_dwords;
unsigned num_shaders_cs_dwords;
+ unsigned num_shader_types;
+ const char * const *shader_type_suffixes;
+ const unsigned *shader_type_bits;
+
void (*get_size)(struct r600_perfcounter_block *,
unsigned count, unsigned *selectors,
unsigned *num_select_dw, unsigned *num_read_dw);
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 7c4717d29fa..af206e43860 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -361,8 +361,8 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
unsigned cmask_tile_elements = cmask_tile_width * cmask_tile_height;
unsigned element_bits = 4;
unsigned cmask_cache_bits = 1024;
- unsigned num_pipes = rscreen->tiling_info.num_channels;
- unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes;
+ unsigned num_pipes = rscreen->info.num_tile_pipes;
+ unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes;
unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements;
@@ -394,8 +394,8 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen,
struct r600_texture *rtex,
struct r600_cmask_info *out)
{
- unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes;
- unsigned num_pipes = rscreen->tiling_info.num_channels;
+ unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
+ unsigned num_pipes = rscreen->info.num_tile_pipes;
unsigned cl_width, cl_height;
switch (num_pipes) {
@@ -515,7 +515,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
{
unsigned cl_width, cl_height, width, height;
unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align;
- unsigned num_pipes = rscreen->tiling_info.num_channels;
+ unsigned num_pipes = rscreen->info.num_tile_pipes;
if (rscreen->chip_class <= EVERGREEN &&
rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26)
@@ -533,6 +533,10 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
return 0;
+ /* Overalign HTILE on Stoney to fix piglit/depthstencil-render-miplevels 585. */
+ if (rscreen->family == CHIP_STONEY)
+ num_pipes = 4;
+
switch (num_pipes) {
case 1:
cl_width = 32;
@@ -565,7 +569,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
slice_elements = (width * height) / (8 * 8);
slice_bytes = slice_elements * 4;
- pipe_interleave_bytes = rscreen->tiling_info.group_bytes;
+ pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
base_align = num_pipes * pipe_interleave_bytes;
rtex->htile.pitch = width;
@@ -1212,10 +1216,30 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,
const struct pipe_surface *templ)
{
unsigned level = templ->u.tex.level;
+ unsigned width = u_minify(tex->width0, level);
+ unsigned height = u_minify(tex->height0, level);
- return r600_create_surface_custom(pipe, tex, templ,
- u_minify(tex->width0, level),
- u_minify(tex->height0, level));
+ if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
+ const struct util_format_description *tex_desc
+ = util_format_description(tex->format);
+ const struct util_format_description *templ_desc
+ = util_format_description(templ->format);
+
+ assert(tex_desc->block.bits == templ_desc->block.bits);
+
+ /* Adjust size of surface if and only if the block width or
+ * height is changed. */
+ if (tex_desc->block.width != templ_desc->block.width ||
+ tex_desc->block.height != templ_desc->block.height) {
+ unsigned nblks_x = util_format_get_nblocksx(tex->format, width);
+ unsigned nblks_y = util_format_get_nblocksy(tex->format, height);
+
+ width = nblks_x * templ_desc->block.width;
+ height = nblks_y * templ_desc->block.height;
+ }
+ }
+
+ return r600_create_surface_custom(pipe, tex, templ, width, height);
}
static void r600_surface_destroy(struct pipe_context *pipe,
@@ -1388,7 +1412,6 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
return;
for (i = 0; i < fb->nr_cbufs; i++) {
- struct r600_surface *surf;
struct r600_texture *tex;
unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
@@ -1399,7 +1422,6 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
if (!(*buffers & clear_bit))
continue;
- surf = (struct r600_surface *)fb->cbufs[i];
tex = (struct r600_texture *)fb->cbufs[i]->texture;
/* 128-bit formats are unusupported */
@@ -1446,8 +1468,8 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
if (clear_words_needed)
tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
} else {
- /* RB+ doesn't work with CMASK fast clear. */
- if (surf->sx_ps_downconvert)
+ /* Stoney/RB+ doesn't work with CMASK fast clear. */
+ if (rctx->family == CHIP_STONEY)
continue;
/* ensure CMASK is enabled */
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 76be37625f3..f5e3f6af1a0 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1452,6 +1452,74 @@ static void emit_minmax_int(const struct lp_build_tgsi_action *action,
emit_data->args[1], "");
}
+static void pk2h_fetch_args(struct lp_build_tgsi_context * bld_base,
+ struct lp_build_emit_data * emit_data)
+{
+ emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+ 0, TGSI_CHAN_X);
+ emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+ 0, TGSI_CHAN_Y);
+}
+
+static void emit_pk2h(const struct lp_build_tgsi_action *action,
+ struct lp_build_tgsi_context *bld_base,
+ struct lp_build_emit_data *emit_data)
+{
+ LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+ LLVMContextRef context = bld_base->base.gallivm->context;
+ struct lp_build_context *uint_bld = &bld_base->uint_bld;
+ LLVMTypeRef fp16, i16;
+ LLVMValueRef const16, comp[2];
+ unsigned i;
+
+ fp16 = LLVMHalfTypeInContext(context);
+ i16 = LLVMInt16TypeInContext(context);
+ const16 = lp_build_const_int32(uint_bld->gallivm, 16);
+
+ for (i = 0; i < 2; i++) {
+ comp[i] = LLVMBuildFPTrunc(builder, emit_data->args[i], fp16, "");
+ comp[i] = LLVMBuildBitCast(builder, comp[i], i16, "");
+ comp[i] = LLVMBuildZExt(builder, comp[i], uint_bld->elem_type, "");
+ }
+
+ comp[1] = LLVMBuildShl(builder, comp[1], const16, "");
+ comp[0] = LLVMBuildOr(builder, comp[0], comp[1], "");
+
+ emit_data->output[emit_data->chan] = comp[0];
+}
+
+static void up2h_fetch_args(struct lp_build_tgsi_context * bld_base,
+ struct lp_build_emit_data * emit_data)
+{
+ emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+ 0, TGSI_CHAN_X);
+}
+
+static void emit_up2h(const struct lp_build_tgsi_action *action,
+ struct lp_build_tgsi_context *bld_base,
+ struct lp_build_emit_data *emit_data)
+{
+ LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+ LLVMContextRef context = bld_base->base.gallivm->context;
+ struct lp_build_context *uint_bld = &bld_base->uint_bld;
+ LLVMTypeRef fp16, i16;
+ LLVMValueRef const16, input, val;
+ unsigned i;
+
+ fp16 = LLVMHalfTypeInContext(context);
+ i16 = LLVMInt16TypeInContext(context);
+ const16 = lp_build_const_int32(uint_bld->gallivm, 16);
+ input = emit_data->args[0];
+
+ for (i = 0; i < 2; i++) {
+ val = i == 1 ? LLVMBuildLShr(builder, input, const16, "") : input;
+ val = LLVMBuildTrunc(builder, val, i16, "");
+ val = LLVMBuildBitCast(builder, val, fp16, "");
+ emit_data->output[i] =
+ LLVMBuildFPExt(builder, val, bld_base->base.elem_type, "");
+ }
+}
+
void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
{
struct lp_type type;
@@ -1581,6 +1649,8 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
bld_base->op_actions[TGSI_OPCODE_UMSB].emit = emit_umsb;
bld_base->op_actions[TGSI_OPCODE_NOT].emit = emit_not;
bld_base->op_actions[TGSI_OPCODE_OR].emit = emit_or;
+ bld_base->op_actions[TGSI_OPCODE_PK2H].fetch_args = pk2h_fetch_args;
+ bld_base->op_actions[TGSI_OPCODE_PK2H].emit = emit_pk2h;
bld_base->op_actions[TGSI_OPCODE_POPC].emit = build_tgsi_intrinsic_nomem;
bld_base->op_actions[TGSI_OPCODE_POPC].intr_name = "llvm.ctpop.i32";
bld_base->op_actions[TGSI_OPCODE_POW].emit = build_tgsi_intrinsic_nomem;
@@ -1618,6 +1688,8 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f;
bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor;
bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp;
+ bld_base->op_actions[TGSI_OPCODE_UP2H].fetch_args = up2h_fetch_args;
+ bld_base->op_actions[TGSI_OPCODE_UP2H].emit = emit_up2h;
}
void radeon_llvm_create_func(struct radeon_llvm_context * ctx,
@@ -1638,11 +1710,9 @@ void radeon_llvm_create_func(struct radeon_llvm_context * ctx,
void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx)
{
struct gallivm_state * gallivm = ctx->soa.bld_base.base.gallivm;
- /* End the main function with Return*/
- LLVMBuildRetVoid(gallivm->builder);
/* Create the pass manager */
- ctx->gallivm.passmgr = LLVMCreateFunctionPassManagerForModule(
+ gallivm->passmgr = LLVMCreateFunctionPassManagerForModule(
gallivm->module);
/* This pass should eliminate all the load and store instructions */
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 2e5caa67d10..7329ceedf04 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -245,46 +245,49 @@ struct radeon_winsys_cs {
};
struct radeon_info {
+ /* Device info. */
uint32_t pci_id;
enum radeon_family family;
enum chip_class chip_class;
uint64_t gart_size;
uint64_t vram_size;
- uint32_t max_sclk;
- uint32_t num_good_compute_units;
- uint32_t max_se;
- uint32_t max_sh_per_se;
+ boolean has_virtual_memory;
+ bool gfx_ib_pad_with_type2;
+ boolean has_sdma;
+ boolean has_uvd;
+ uint32_t vce_fw_version;
+ uint32_t vce_harvest_config;
+ uint32_t clock_crystal_freq;
+ /* Kernel info. */
uint32_t drm_major; /* version */
uint32_t drm_minor;
uint32_t drm_patchlevel;
-
- boolean has_uvd;
- uint32_t vce_fw_version;
boolean has_userptr;
- bool gfx_ib_pad_with_type2;
+ /* Shader cores. */
+ uint32_t r600_max_quad_pipes; /* wave size / 16 */
+ uint32_t max_shader_clock;
+ uint32_t num_good_compute_units;
+ uint32_t max_se; /* shader engines */
+ uint32_t max_sh_per_se; /* shader arrays per shader engine */
+
+ /* Render backends (color + depth blocks). */
uint32_t r300_num_gb_pipes;
uint32_t r300_num_z_pipes;
+ uint32_t r600_gb_backend_map; /* R600 harvest config */
+ boolean r600_gb_backend_map_valid;
+ uint32_t r600_num_banks;
+ uint32_t num_render_backends;
+ uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
+ uint32_t pipe_interleave_bytes;
+ uint32_t enabled_rb_mask; /* GCN harvest config */
- uint32_t r600_num_backends;
- uint32_t r600_clock_crystal_freq;
- uint32_t r600_tiling_config;
- uint32_t r600_num_tile_pipes;
- uint32_t r600_max_pipes;
- boolean r600_virtual_address;
- boolean r600_has_dma;
-
- uint32_t r600_backend_map;
- boolean r600_backend_map_valid;
-
+ /* Tile modes. */
boolean si_tile_mode_array_valid;
uint32_t si_tile_mode_array[32];
- uint32_t si_backend_enabled_mask;
-
boolean cik_macrotile_mode_array_valid;
uint32_t cik_macrotile_mode_array[16];
- uint32_t vce_harvest_config;
};
enum radeon_feature_id {
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 105a1b2a878..76913914b38 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -308,7 +308,7 @@ void cik_sdma_copy(struct pipe_context *ctx,
}
mtilew = (8 * rsrc->surface.bankw *
- sctx->screen->b.tiling_info.num_channels) *
+ sctx->screen->b.info.num_tile_pipes) *
rsrc->surface.mtilea;
assert(!(mtilew & (mtilew - 1)));
mtileh = (8 * rsrc->surface.bankh * num_banks) /
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 6ef6eeec178..825fbb181ba 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -461,9 +461,6 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){
LLVMContextDispose(program->llvm_ctx);
}
#else
- FREE(program->shader.binary.config);
- FREE(program->shader.binary.rodata);
- FREE(program->shader.binary.global_symbol_offsets);
si_shader_destroy(&program->shader);
#endif
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index baa02293c41..d60c4515625 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -177,7 +177,7 @@ void si_begin_new_cs(struct si_context *ctx)
si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs);
si_mark_atom_dirty(ctx, &ctx->msaa_config);
si_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
- si_mark_atom_dirty(ctx, &ctx->cb_target_mask);
+ si_mark_atom_dirty(ctx, &ctx->cb_render_state);
si_mark_atom_dirty(ctx, &ctx->blend_color.atom);
si_mark_atom_dirty(ctx, &ctx->db_render_state);
si_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 7ee1daee7bf..24855e4e6f2 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -56,6 +56,8 @@ enum si_pc_reg_layout {
/* Registers are laid out in decreasing rather than increasing order. */
SI_PC_REG_REVERSE = 4,
+
+ SI_PC_FAKE = 8,
};
struct si_pc_block_base {
@@ -79,6 +81,23 @@ struct si_pc_block {
unsigned instances;
};
+/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
+ * performance counter group IDs.
+ */
+static const char * const si_pc_shader_type_suffixes[] = {
+ "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
+};
+
+static const unsigned si_pc_shader_type_bits[] = {
+ 0x7f,
+ S_036780_ES_EN(1),
+ S_036780_GS_EN(1),
+ S_036780_VS_EN(1),
+ S_036780_PS_EN(1),
+ S_036780_LS_EN(1),
+ S_036780_HS_EN(1),
+ S_036780_CS_EN(1),
+};
static struct si_pc_block_base cik_CB = {
.name = "CB",
@@ -308,56 +327,80 @@ static struct si_pc_block_base cik_WD = {
.counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};
+static struct si_pc_block_base cik_MC = {
+ .name = "MC",
+ .num_counters = 4,
+
+ .layout = SI_PC_FAKE,
+};
+
+static struct si_pc_block_base cik_SRBM = {
+ .name = "SRBM",
+ .num_counters = 2,
+
+ .layout = SI_PC_FAKE,
+};
+
/* Both the number of instances and selectors varies between chips of the same
* class. We only differentiate by class here and simply expose the maximum
* number over all chips in a class.
+ *
+ * Unfortunately, GPUPerfStudio uses the order of performance counter groups
+ * blindly once it believes it has identified the hardware, so the order of
+ * blocks here matters.
*/
static struct si_pc_block groups_CIK[] = {
{ &cik_CB, 226, 4 },
- { &cik_CPC, 22 },
{ &cik_CPF, 17 },
- { &cik_CPG, 46 },
{ &cik_DB, 257, 4 },
- { &cik_GDS, 121 },
{ &cik_GRBM, 34 },
{ &cik_GRBMSE, 15 },
- { &cik_IA, 22 },
- { &cik_PA_SC, 395 },
{ &cik_PA_SU, 153 },
+ { &cik_PA_SC, 395 },
{ &cik_SPI, 186 },
{ &cik_SQ, 252 },
{ &cik_SX, 32 },
{ &cik_TA, 111, 11 },
{ &cik_TCA, 39, 2 },
{ &cik_TCC, 160, 16 },
- { &cik_TCP, 154, 11 },
{ &cik_TD, 55, 11 },
+ { &cik_TCP, 154, 11 },
+ { &cik_GDS, 121 },
{ &cik_VGT, 140 },
+ { &cik_IA, 22 },
+ { &cik_MC, 22 },
+ { &cik_SRBM, 19 },
{ &cik_WD, 22 },
+ { &cik_CPG, 46 },
+ { &cik_CPC, 22 },
+
};
static struct si_pc_block groups_VI[] = {
{ &cik_CB, 396, 4 },
- { &cik_CPC, 24 },
{ &cik_CPF, 19 },
- { &cik_CPG, 48 },
{ &cik_DB, 257, 4 },
- { &cik_GDS, 121 },
{ &cik_GRBM, 34 },
{ &cik_GRBMSE, 15 },
- { &cik_IA, 24 },
- { &cik_PA_SC, 397 },
{ &cik_PA_SU, 153 },
+ { &cik_PA_SC, 397 },
{ &cik_SPI, 197 },
{ &cik_SQ, 273 },
{ &cik_SX, 34 },
{ &cik_TA, 119, 16 },
{ &cik_TCA, 35, 2 },
{ &cik_TCC, 192, 16 },
- { &cik_TCP, 180, 16 },
{ &cik_TD, 55, 16 },
+ { &cik_TCP, 180, 16 },
+ { &cik_GDS, 121 },
{ &cik_VGT, 147 },
+ { &cik_IA, 24 },
+ { &cik_MC, 22 },
+ { &cik_SRBM, 27 },
{ &cik_WD, 37 },
+ { &cik_CPG, 48 },
+ { &cik_CPC, 24 },
+
};
static void si_pc_get_size(struct r600_perfcounter_block *group,
@@ -368,7 +411,9 @@ static void si_pc_get_size(struct r600_perfcounter_block *group,
struct si_pc_block_base *regs = sigroup->b;
unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
- if (layout_multi == SI_PC_MULTI_BLOCK) {
+ if (regs->layout & SI_PC_FAKE) {
+ *num_select_dw = 0;
+ } else if (layout_multi == SI_PC_MULTI_BLOCK) {
if (count < regs->num_multi)
*num_select_dw = 2 * (count + 2) + regs->num_prelude;
else
@@ -431,6 +476,9 @@ static void si_pc_emit_select(struct r600_common_context *ctx,
assert(count <= regs->num_counters);
+ if (regs->layout & SI_PC_FAKE)
+ return;
+
if (layout_multi == SI_PC_MULTI_BLOCK) {
assert(!(regs->layout & SI_PC_REG_REVERSE));
@@ -590,22 +638,35 @@ static void si_pc_emit_read(struct r600_common_context *ctx,
unsigned reg = regs->counter0_lo;
unsigned reg_delta = 8;
- if (regs->layout & SI_PC_REG_REVERSE)
- reg_delta = -reg_delta;
+ if (!(regs->layout & SI_PC_FAKE)) {
+ if (regs->layout & SI_PC_REG_REVERSE)
+ reg_delta = -reg_delta;
- for (idx = 0; idx < count; ++idx) {
- if (regs->counters)
- reg = regs->counters[idx];
+ for (idx = 0; idx < count; ++idx) {
+ if (regs->counters)
+ reg = regs->counters[idx];
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
- COPY_DATA_DST_SEL(COPY_DATA_MEM));
- radeon_emit(cs, reg >> 2);
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- va += 4;
- reg += reg_delta;
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
+ COPY_DATA_DST_SEL(COPY_DATA_MEM));
+ radeon_emit(cs, reg >> 2);
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ va += 4;
+ reg += reg_delta;
+ }
+ } else {
+ for (idx = 0; idx < count; ++idx) {
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
+ COPY_DATA_DST_SEL(COPY_DATA_MEM));
+ radeon_emit(cs, 0); /* immediate */
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ va += 4;
+ }
}
}
@@ -656,6 +717,10 @@ void si_init_perfcounters(struct si_screen *screen)
pc->num_stop_cs_dwords += 6;
}
+ pc->num_shader_types = ARRAY_SIZE(si_pc_shader_type_bits);
+ pc->shader_type_suffixes = si_pc_shader_type_suffixes;
+ pc->shader_type_bits = si_pc_shader_type_bits;
+
pc->get_size = si_pc_get_size;
pc->emit_instance = si_pc_emit_instance;
pc->emit_shaders = si_pc_emit_shaders;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 0c1ae90f9da..61ce976c32c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -215,7 +215,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
r600_target = radeon_llvm_get_r600_target(triple);
sctx->tm = LLVMCreateTargetMachine(r600_target, triple,
r600_get_llvm_processor_name(sscreen->b.family),
- "+DumpCode,+vgpr-spilling",
+#if HAVE_LLVM >= 0x0308
+ sscreen->b.debug_flags & DBG_SI_SCHED ?
+ "+DumpCode,+vgpr-spilling,+si-scheduler" :
+#endif
+ "+DumpCode,+vgpr-spilling",
LLVMCodeGenLevelDefault,
LLVMRelocDefault,
LLVMCodeModelDefault);
@@ -304,6 +308,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_INVALIDATE_BUFFER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 1;
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -329,12 +335,18 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
return 4;
+ case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+ return HAVE_LLVM >= 0x0306;
+
case PIPE_CAP_GLSL_FEATURE_LEVEL:
return HAVE_LLVM >= 0x0307 ? 410 : 330;
case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
return MIN2(sscreen->b.info.vram_size, 0xFFFFFFFF);
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ return 0;
+
/* Unsupported features. */
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
@@ -344,12 +356,12 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_VERTEXID_NOBASE:
case PIPE_CAP_CLEAR_TEXTURE:
case PIPE_CAP_DRAW_PARAMETERS:
- case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 0;
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -399,7 +411,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
/* Timer queries, present when the clock frequency is non zero. */
case PIPE_CAP_QUERY_TIMESTAMP:
case PIPE_CAP_QUERY_TIME_ELAPSED:
- return sscreen->b.info.r600_clock_crystal_freq != 0;
+ return sscreen->b.info.clock_crystal_freq != 0;
case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
case PIPE_CAP_MIN_TEXEL_OFFSET:
@@ -541,57 +553,6 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
r600_destroy_common_screen(&sscreen->b);
}
-#define SI_TILE_MODE_COLOR_2D_8BPP 14
-
-/* Initialize pipe config. This is especially important for GPUs
- * with 16 pipes and more where it's initialized incorrectly by
- * the TILING_CONFIG ioctl. */
-static bool si_initialize_pipe_config(struct si_screen *sscreen)
-{
- unsigned mode2d;
-
- /* This is okay, because there can be no 2D tiling without
- * the tile mode array, so we won't need the pipe config.
- * Return "success".
- */
- if (!sscreen->b.info.si_tile_mode_array_valid)
- return true;
-
- /* The same index is used for the 2D mode on CIK too. */
- mode2d = sscreen->b.info.si_tile_mode_array[SI_TILE_MODE_COLOR_2D_8BPP];
-
- switch (G_009910_PIPE_CONFIG(mode2d)) {
- case V_02803C_ADDR_SURF_P2:
- sscreen->b.tiling_info.num_channels = 2;
- break;
- case V_02803C_X_ADDR_SURF_P4_8X16:
- case V_02803C_X_ADDR_SURF_P4_16X16:
- case V_02803C_X_ADDR_SURF_P4_16X32:
- case V_02803C_X_ADDR_SURF_P4_32X32:
- sscreen->b.tiling_info.num_channels = 4;
- break;
- case V_02803C_X_ADDR_SURF_P8_16X16_8X16:
- case V_02803C_X_ADDR_SURF_P8_16X32_8X16:
- case V_02803C_X_ADDR_SURF_P8_32X32_8X16:
- case V_02803C_X_ADDR_SURF_P8_16X32_16X16:
- case V_02803C_X_ADDR_SURF_P8_32X32_16X16:
- case V_02803C_X_ADDR_SURF_P8_32X32_16X32:
- case V_02803C_X_ADDR_SURF_P8_32X64_32X32:
- sscreen->b.tiling_info.num_channels = 8;
- break;
- case V_02803C_X_ADDR_SURF_P16_32X32_8X16:
- case V_02803C_X_ADDR_SURF_P16_32X32_16X16:
- sscreen->b.tiling_info.num_channels = 16;
- break;
- default:
- assert(0);
- fprintf(stderr, "radeonsi: Unknown pipe config %i.\n",
- G_009910_PIPE_CONFIG(mode2d));
- return false;
- }
- return true;
-}
-
static bool si_init_gs_info(struct si_screen *sscreen)
{
switch (sscreen->b.family) {
@@ -636,7 +597,6 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
sscreen->b.b.resource_create = r600_resource_create_common;
if (!r600_common_screen_init(&sscreen->b, ws) ||
- !si_initialize_pipe_config(sscreen) ||
!si_init_gs_info(sscreen)) {
FREE(sscreen);
return NULL;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index e2725fe3679..48947442757 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -193,7 +193,7 @@ struct si_context {
struct r600_atom db_render_state;
struct r600_atom msaa_config;
struct si_sample_mask sample_mask;
- struct r600_atom cb_target_mask;
+ struct r600_atom cb_render_state;
struct si_blend_color blend_color;
struct r600_atom clip_regs;
struct si_clip_state clip_state;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 94c1129c88d..d9ed6b234e0 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4074,7 +4074,7 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
si_shader_dump_disassembly(&shader->binary, debug);
si_shader_dump_stats(sscreen, &shader->config,
- shader->selector->info.num_inputs,
+ shader->selector ? shader->selector->info.num_inputs : 0,
shader->binary.code_size, debug, processor);
}
@@ -4092,7 +4092,7 @@ int si_compile_llvm(struct si_screen *sscreen,
if (r600_can_dump_shader(&sscreen->b, processor)) {
fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
- if (!(sscreen->b.debug_flags & DBG_NO_IR))
+ if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR)))
LLVMDumpModule(mod);
}
@@ -4177,6 +4177,13 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
+ LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+
+ /* Dump LLVM IR before any optimization passes */
+ if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
+ r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
+ LLVMDumpModule(bld_base->base.gallivm->module);
+
radeon_llvm_finalize_module(&si_shader_ctx->radeon_bld);
if (dump)
@@ -4383,9 +4390,16 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
goto out;
}
+ LLVMBuildRetVoid(bld_base->base.gallivm->builder);
+ mod = bld_base->base.gallivm->module;
+
+ /* Dump LLVM IR before any optimization passes */
+ if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
+ r600_can_dump_shader(&sscreen->b, si_shader_ctx.type))
+ LLVMDumpModule(mod);
+
radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
- mod = bld_base->base.gallivm->module;
r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
mod, debug, si_shader_ctx.type);
if (r) {
@@ -4423,14 +4437,6 @@ out:
return r;
}
-void si_shader_destroy_binary(struct radeon_shader_binary *binary)
-{
- FREE(binary->code);
- FREE(binary->rodata);
- FREE(binary->relocs);
- FREE(binary->disasm_string);
-}
-
void si_shader_destroy(struct si_shader *shader)
{
if (shader->gs_copy_shader) {
@@ -4442,5 +4448,6 @@ void si_shader_destroy(struct si_shader *shader)
r600_resource_reference(&shader->scratch_bo, NULL);
r600_resource_reference(&shader->bo, NULL);
- si_shader_destroy_binary(&shader->binary);
+
+ radeon_shader_binary_clean(&shader->binary);
}
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index c1512078a18..98bdb890a45 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -345,7 +345,6 @@ int si_compile_llvm(struct si_screen *sscreen,
struct pipe_debug_callback *debug,
unsigned processor);
void si_shader_destroy(struct si_shader *shader);
-void si_shader_destroy_binary(struct radeon_shader_binary *binary);
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 9e0ccfc5dde..bf780777b50 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -97,7 +97,7 @@ uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex)
}
/* The old way. */
- switch (sscreen->b.tiling_info.num_banks) {
+ switch (sscreen->b.info.r600_num_banks) {
case 2:
return V_02803C_ADDR_SURF_2_BANK;
case 4:
@@ -189,14 +189,14 @@ unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode)
/* This is probably broken for a lot of chips, but it's only used
* if the kernel cannot return the tile mode array for CIK. */
- switch (sscreen->b.info.r600_num_tile_pipes) {
+ switch (sscreen->b.info.num_tile_pipes) {
case 16:
return V_02803C_X_ADDR_SURF_P16_32X32_16X16;
case 8:
return V_02803C_X_ADDR_SURF_P8_32X32_16X16;
case 4:
default:
- if (sscreen->b.info.r600_num_backends == 4)
+ if (sscreen->b.info.num_render_backends == 4)
return V_02803C_X_ADDR_SURF_P4_16X16;
else
return V_02803C_X_ADDR_SURF_P4_8X16;
@@ -238,7 +238,8 @@ static unsigned si_pack_float_12p4(float x)
/*
* Inferred framebuffer and blender state.
*
- * One of the reasons this must be derived from the framebuffer state is that:
+ * One of the reasons CB_TARGET_MASK must be derived from the framebuffer state
+ * is that:
* - The blend state mask is 0xf most of the time.
* - The COLOR1 format isn't INVALID because of possible dual-source blending,
* so COLOR1 is enabled pretty much all the time.
@@ -246,18 +247,18 @@ static unsigned si_pack_float_12p4(float x)
*
* Another reason is to avoid a hang with dual source blending.
*/
-static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *atom)
{
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_state_blend *blend = sctx->queued.named.blend;
- uint32_t mask = 0, i;
+ uint32_t cb_target_mask = 0, i;
for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++)
if (sctx->framebuffer.state.cbufs[i])
- mask |= 0xf << (4*i);
+ cb_target_mask |= 0xf << (4*i);
if (blend)
- mask &= blend->cb_target_mask;
+ cb_target_mask &= blend->cb_target_mask;
/* Avoid a hang that happens when dual source blending is enabled
* but there is not enough color outputs. This is undefined behavior,
@@ -268,9 +269,146 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
if (blend && blend->dual_src_blend &&
sctx->ps_shader.cso &&
(sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
- mask = 0;
+ cb_target_mask = 0;
- radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, mask);
+ radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, cb_target_mask);
+
+ /* STONEY-specific register settings. */
+ if (sctx->b.family == CHIP_STONEY) {
+ unsigned spi_shader_col_format =
+ sctx->ps_shader.cso ?
+ sctx->ps_shader.current->key.ps.spi_shader_col_format : 0;
+ unsigned sx_ps_downconvert = 0;
+ unsigned sx_blend_opt_epsilon = 0;
+ unsigned sx_blend_opt_control = 0;
+
+ for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ struct r600_surface *surf =
+ (struct r600_surface*)sctx->framebuffer.state.cbufs[i];
+ unsigned format, swap, spi_format, colormask;
+ bool has_alpha, has_rgb;
+
+ if (!surf)
+ continue;
+
+ format = G_028C70_FORMAT(surf->cb_color_info);
+ swap = G_028C70_COMP_SWAP(surf->cb_color_info);
+ spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
+ colormask = (cb_target_mask >> (i * 4)) & 0xf;
+
+ /* Set if RGB and A are present. */
+ has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib);
+
+ if (format == V_028C70_COLOR_8 ||
+ format == V_028C70_COLOR_16 ||
+ format == V_028C70_COLOR_32)
+ has_rgb = !has_alpha;
+ else
+ has_rgb = true;
+
+ /* Check the colormask and export format. */
+ if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A)))
+ has_rgb = false;
+ if (!(colormask & PIPE_MASK_A))
+ has_alpha = false;
+
+ if (spi_format == V_028714_SPI_SHADER_ZERO) {
+ has_rgb = false;
+ has_alpha = false;
+ }
+
+ /* Disable value checking for disabled channels. */
+ if (!has_rgb)
+ sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
+ if (!has_alpha)
+ sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
+
+ /* Enable down-conversion for 32bpp and smaller formats. */
+ switch (format) {
+ case V_028C70_COLOR_8:
+ case V_028C70_COLOR_8_8:
+ case V_028C70_COLOR_8_8_8_8:
+ /* For 1 and 2-channel formats, use the superset thereof. */
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_5_6_5:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_1_5_5_5:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_4_4_4_4:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_32:
+ if (swap == V_0280A0_SWAP_STD &&
+ spi_format == V_028714_SPI_SHADER_32_R)
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
+ else if (swap == V_0280A0_SWAP_ALT_REV &&
+ spi_format == V_028714_SPI_SHADER_32_AR)
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
+ break;
+
+ case V_028C70_COLOR_16:
+ case V_028C70_COLOR_16_16:
+ /* For 1-channel formats, use the superset thereof. */
+ if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+ if (swap == V_0280A0_SWAP_STD ||
+ swap == V_0280A0_SWAP_STD_REV)
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
+ else
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_10_11_11:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_2_10_10_10:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
+ }
+ break;
+ }
+ }
+
+ if (sctx->screen->b.debug_flags & DBG_NO_RB_PLUS) {
+ sx_ps_downconvert = 0;
+ sx_blend_opt_epsilon = 0;
+ sx_blend_opt_control = 0;
+ }
+
+ radeon_set_context_reg_seq(cs, R_028754_SX_PS_DOWNCONVERT, 3);
+ radeon_emit(cs, sx_ps_downconvert); /* R_028754_SX_PS_DOWNCONVERT */
+ radeon_emit(cs, sx_blend_opt_epsilon); /* R_028758_SX_BLEND_OPT_EPSILON */
+ radeon_emit(cs, sx_blend_opt_control); /* R_02875C_SX_BLEND_OPT_CONTROL */
+ }
}
/*
@@ -390,6 +528,36 @@ static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
}
}
+/**
+ * Get rid of DST in the blend factors by commuting the operands:
+ * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
+ */
+static void si_blend_remove_dst(unsigned *func, unsigned *src_factor,
+ unsigned *dst_factor, unsigned expected_dst,
+ unsigned replacement_src)
+{
+ if (*src_factor == expected_dst &&
+ *dst_factor == PIPE_BLENDFACTOR_ZERO) {
+ *src_factor = PIPE_BLENDFACTOR_ZERO;
+ *dst_factor = replacement_src;
+
+ /* Commuting the operands requires reversing subtractions. */
+ if (*func == PIPE_BLEND_SUBTRACT)
+ *func = PIPE_BLEND_REVERSE_SUBTRACT;
+ else if (*func == PIPE_BLEND_REVERSE_SUBTRACT)
+ *func = PIPE_BLEND_SUBTRACT;
+ }
+}
+
+static bool si_blend_factor_uses_dst(unsigned factor)
+{
+ return factor == PIPE_BLENDFACTOR_DST_COLOR ||
+ factor == PIPE_BLENDFACTOR_DST_ALPHA ||
+ factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ factor == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
+ factor == PIPE_BLENDFACTOR_INV_DST_COLOR;
+}
+
static void *si_create_blend_state_mode(struct pipe_context *ctx,
const struct pipe_blend_state *state,
unsigned mode)
@@ -397,7 +565,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
struct si_context *sctx = (struct si_context*)ctx;
struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
struct si_pm4_state *pm4 = &blend->pm4;
-
+ uint32_t sx_mrt_blend_opt[8] = {0};
uint32_t color_control = 0;
if (!blend)
@@ -435,12 +603,17 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
unsigned srcA = state->rt[j].alpha_src_factor;
unsigned dstA = state->rt[j].alpha_dst_factor;
+ unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
unsigned blend_cntl = 0;
+ sx_mrt_blend_opt[i] =
+ S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
+ S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
+
if (!state->rt[j].colormask)
continue;
- /* we pretend 8 buffer are used, CB_SHADER_MASK will disable unused one */
+ /* cb_render_state will disable unused ones */
blend->cb_target_mask |= state->rt[j].colormask << (4 * i);
if (!state->rt[j].blend_enable) {
@@ -448,6 +621,50 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
continue;
}
+ /* Blending optimizations for Stoney.
+ * These transformations don't change the behavior.
+ *
+ * First, get rid of DST in the blend factors:
+ * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
+ */
+ si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB,
+ PIPE_BLENDFACTOR_DST_COLOR,
+ PIPE_BLENDFACTOR_SRC_COLOR);
+ si_blend_remove_dst(&eqA, &srcA, &dstA,
+ PIPE_BLENDFACTOR_DST_COLOR,
+ PIPE_BLENDFACTOR_SRC_COLOR);
+ si_blend_remove_dst(&eqA, &srcA, &dstA,
+ PIPE_BLENDFACTOR_DST_ALPHA,
+ PIPE_BLENDFACTOR_SRC_ALPHA);
+
+ /* Look up the ideal settings from tables. */
+ srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
+ dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
+ srcA_opt = si_translate_blend_opt_factor(srcA, true);
+ dstA_opt = si_translate_blend_opt_factor(dstA, true);
+
+ /* Handle interdependencies. */
+ if (si_blend_factor_uses_dst(srcRGB))
+ dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+ if (si_blend_factor_uses_dst(srcA))
+ dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+
+ if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
+ (dstRGB == PIPE_BLENDFACTOR_ZERO ||
+ dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE))
+ dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+
+ /* Set the final value. */
+ sx_mrt_blend_opt[i] =
+ S_028760_COLOR_SRC_OPT(srcRGB_opt) |
+ S_028760_COLOR_DST_OPT(dstRGB_opt) |
+ S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
+ S_028760_ALPHA_SRC_OPT(srcA_opt) |
+ S_028760_ALPHA_DST_OPT(dstA_opt) |
+ S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
+
+ /* Set blend state. */
blend_cntl |= S_028780_ENABLE(1);
blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
@@ -480,41 +697,13 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
}
if (sctx->b.family == CHIP_STONEY) {
- uint32_t sx_blend_opt_control = 0;
-
- for (int i = 0; i < 8; i++) {
- const int j = state->independent_blend_enable ? i : 0;
-
- /* TODO: We can also set this if the surface doesn't contain RGB. */
- if (!state->rt[j].blend_enable ||
- !(state->rt[j].colormask & (PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B)))
- sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (4 * i);
-
- /* TODO: We can also set this if the surface doesn't contain alpha. */
- if (!state->rt[j].blend_enable ||
- !(state->rt[j].colormask & PIPE_MASK_A))
- sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (4 * i);
-
- if (!state->rt[j].blend_enable) {
- si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4,
- S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
- S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED));
- continue;
- }
-
+ for (int i = 0; i < 8; i++)
si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4,
- S_028760_COLOR_SRC_OPT(si_translate_blend_opt_factor(state->rt[j].rgb_src_factor, false)) |
- S_028760_COLOR_DST_OPT(si_translate_blend_opt_factor(state->rt[j].rgb_dst_factor, false)) |
- S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(state->rt[j].rgb_func)) |
- S_028760_ALPHA_SRC_OPT(si_translate_blend_opt_factor(state->rt[j].alpha_src_factor, true)) |
- S_028760_ALPHA_DST_OPT(si_translate_blend_opt_factor(state->rt[j].alpha_dst_factor, true)) |
- S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(state->rt[j].alpha_func)));
- }
+ sx_mrt_blend_opt[i]);
- si_pm4_set_reg(pm4, R_02875C_SX_BLEND_OPT_CONTROL, sx_blend_opt_control);
-
- /* RB+ doesn't work with dual source blending */
- if (blend->dual_src_blend)
+ /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
+ if (blend->dual_src_blend || state->logicop_enable ||
+ mode == V_028808_CB_RESOLVE)
color_control |= S_028808_DISABLE_DUAL_QUAD(1);
}
@@ -532,7 +721,7 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
si_pm4_bind_state(sctx, blend, (struct si_state_blend *)state);
- si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
+ si_mark_atom_dirty(sctx, &sctx->cb_render_state);
}
static void si_delete_blend_state(struct pipe_context *ctx, void *state)
@@ -2097,8 +2286,10 @@ static void si_initialize_color_surface(struct si_context *sctx,
color_pitch = S_028C64_TILE_MAX(pitch);
+ /* Intensity is implemented as Red, so treat it that way. */
color_attrib = S_028C74_TILE_MODE_INDEX(tile_mode_index) |
- S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1);
+ S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1 ||
+ util_format_is_intensity(surf->base.format));
if (rtex->resource.b.b.nr_samples > 1) {
unsigned log_samples = util_logbase2(rtex->resource.b.b.nr_samples);
@@ -2169,61 +2360,6 @@ static void si_initialize_color_surface(struct si_context *sctx,
/* Determine pixel shader export format */
si_choose_spi_color_formats(surf, format, swap, ntype, rtex->is_depth);
- if (sctx->b.family == CHIP_STONEY &&
- !(sctx->screen->b.debug_flags & DBG_NO_RB_PLUS)) {
- switch (desc->channel[0].size) {
- case 32:
- if (desc->nr_channels == 1) {
- if (swap == V_0280A0_SWAP_STD)
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;
- else if (swap == V_0280A0_SWAP_ALT_REV)
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_A;
- }
- break;
- case 16:
- /* For 1-channel formats, use the superset thereof. */
- if (desc->nr_channels <= 2) {
- if (swap == V_0280A0_SWAP_STD ||
- swap == V_0280A0_SWAP_STD_REV)
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_16_16_GR;
- else
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_16_16_AR;
- }
- break;
- case 11:
- if (desc->nr_channels == 3) {
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_10_11_11;
- surf->sx_blend_opt_epsilon = V_028758_11BIT_FORMAT;
- }
- break;
- case 10:
- if (desc->nr_channels == 4) {
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_2_10_10_10;
- surf->sx_blend_opt_epsilon = V_028758_10BIT_FORMAT;
- }
- break;
- case 8:
- /* For 1 and 2-channel formats, use the superset thereof. */
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_8_8_8_8;
- surf->sx_blend_opt_epsilon = V_028758_8BIT_FORMAT;
- break;
- case 5:
- if (desc->nr_channels == 3) {
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_5_6_5;
- surf->sx_blend_opt_epsilon = V_028758_6BIT_FORMAT;
- } else if (desc->nr_channels == 4) {
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_1_5_5_5;
- surf->sx_blend_opt_epsilon = V_028758_5BIT_FORMAT;
- }
- break;
- case 4:
- /* For 1 nad 2-channel formats, use the superset thereof. */
- surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_4_4_4_4;
- surf->sx_blend_opt_epsilon = V_028758_4BIT_FORMAT;
- break;
- }
- }
-
surf->color_initialized = true;
}
@@ -2459,7 +2595,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
}
si_update_poly_offset_state(sctx);
- si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
+ si_mark_atom_dirty(sctx, &sctx->cb_render_state);
si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
if (sctx->framebuffer.nr_samples != old_nr_samples) {
@@ -2512,8 +2648,6 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
unsigned i, nr_cbufs = state->nr_cbufs;
struct r600_texture *tex = NULL;
struct r600_surface *cb = NULL;
- uint32_t sx_ps_downconvert = 0;
- uint32_t sx_blend_opt_epsilon = 0;
/* Colorbuffers. */
for (i = 0; i < nr_cbufs; i++) {
@@ -2564,29 +2698,18 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
if (sctx->b.chip_class >= VI)
radeon_emit(cs, cb->cb_dcc_base); /* R_028C94_CB_COLOR0_DCC_BASE */
-
- sx_ps_downconvert |= cb->sx_ps_downconvert << (4 * i);
- sx_blend_opt_epsilon |= cb->sx_blend_opt_epsilon << (4 * i);
}
/* set CB_COLOR1_INFO for possible dual-src blending */
if (i == 1 && state->cbufs[0] &&
sctx->framebuffer.dirty_cbufs & (1 << 0)) {
radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + 1 * 0x3C,
cb->cb_color_info | tex->cb_color_info);
- sx_ps_downconvert |= cb->sx_ps_downconvert << (4 * i);
- sx_blend_opt_epsilon |= cb->sx_blend_opt_epsilon << (4 * i);
i++;
}
for (; i < 8 ; i++)
if (sctx->framebuffer.dirty_cbufs & (1 << i))
radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
- if (sctx->b.family == CHIP_STONEY) {
- radeon_set_context_reg_seq(cs, R_028754_SX_PS_DOWNCONVERT, 2);
- radeon_emit(cs, sx_ps_downconvert); /* R_028754_SX_PS_DOWNCONVERT */
- radeon_emit(cs, sx_blend_opt_epsilon); /* R_028758_SX_BLEND_OPT_EPSILON */
- }
-
/* ZS buffer. */
if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
@@ -3374,7 +3497,7 @@ void si_init_state_functions(struct si_context *sctx)
si_init_atom(sctx, &sctx->db_render_state, &sctx->atoms.s.db_render_state, si_emit_db_render_state);
si_init_atom(sctx, &sctx->msaa_config, &sctx->atoms.s.msaa_config, si_emit_msaa_config);
si_init_atom(sctx, &sctx->sample_mask.atom, &sctx->atoms.s.sample_mask, si_emit_sample_mask);
- si_init_atom(sctx, &sctx->cb_target_mask, &sctx->atoms.s.cb_target_mask, si_emit_cb_target_mask);
+ si_init_atom(sctx, &sctx->cb_render_state, &sctx->atoms.s.cb_render_state, si_emit_cb_render_state);
si_init_atom(sctx, &sctx->blend_color.atom, &sctx->atoms.s.blend_color, si_emit_blend_color);
si_init_atom(sctx, &sctx->clip_regs, &sctx->atoms.s.clip_regs, si_emit_clip_regs);
si_init_atom(sctx, &sctx->clip_state.atom, &sctx->atoms.s.clip_state, si_emit_clip_state);
@@ -3449,8 +3572,8 @@ si_write_harvested_raster_configs(struct si_context *sctx,
{
unsigned sh_per_se = MAX2(sctx->screen->b.info.max_sh_per_se, 1);
unsigned num_se = MAX2(sctx->screen->b.info.max_se, 1);
- unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
- unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16);
+ unsigned rb_mask = sctx->screen->b.info.enabled_rb_mask;
+ unsigned num_rb = MIN2(sctx->screen->b.info.num_render_backends, 16);
unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
unsigned rb_per_se = num_rb / num_se;
unsigned se_mask[4];
@@ -3579,8 +3702,8 @@ si_write_harvested_raster_configs(struct si_context *sctx,
static void si_init_config(struct si_context *sctx)
{
struct si_screen *sscreen = sctx->screen;
- unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16);
- unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
+ unsigned num_rb = MIN2(sctx->screen->b.info.num_render_backends, 16);
+ unsigned rb_mask = sctx->screen->b.info.enabled_rb_mask;
unsigned raster_config, raster_config_1;
uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index be3488e6dba..507f45938ce 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -124,7 +124,7 @@ union si_state_atoms {
struct r600_atom *db_render_state;
struct r600_atom *msaa_config;
struct r600_atom *sample_mask;
- struct r600_atom *cb_target_mask;
+ struct r600_atom *cb_render_state;
struct r600_atom *blend_color;
struct r600_atom *clip_regs;
struct r600_atom *clip_state;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 36174eb5a94..bbef429edc5 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -705,23 +705,21 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
}
/* Select the hw shader variant depending on the current state. */
-static int si_shader_select(struct pipe_context *ctx,
- struct si_shader_ctx_state *state)
+static int si_shader_select_with_key(struct pipe_context *ctx,
+ struct si_shader_ctx_state *state,
+ union si_shader_key *key)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_shader_selector *sel = state->cso;
struct si_shader *current = state->current;
- union si_shader_key key;
struct si_shader *iter, *shader = NULL;
int r;
- si_shader_selector_key(ctx, sel, &key);
-
/* Check if we don't need to change anything.
* This path is also used for most shaders that don't need multiple
* variants, it will cost just a computation of the key and this
* test. */
- if (likely(current && memcmp(¤t->key, &key, sizeof(key)) == 0))
+ if (likely(current && memcmp(¤t->key, key, sizeof(*key)) == 0))
return 0;
pipe_mutex_lock(sel->mutex);
@@ -730,7 +728,7 @@ static int si_shader_select(struct pipe_context *ctx,
for (iter = sel->first_variant; iter; iter = iter->next_variant) {
/* Don't check the "current" shader. We checked it above. */
if (current != iter &&
- memcmp(&iter->key, &key, sizeof(key)) == 0) {
+ memcmp(&iter->key, key, sizeof(*key)) == 0) {
state->current = iter;
pipe_mutex_unlock(sel->mutex);
return 0;
@@ -744,7 +742,7 @@ static int si_shader_select(struct pipe_context *ctx,
return -ENOMEM;
}
shader->selector = sel;
- shader->key = key;
+ shader->key = *key;
r = si_shader_create(sctx->screen, sctx->tm, shader, &sctx->b.debug);
if (unlikely(r)) {
@@ -768,6 +766,15 @@ static int si_shader_select(struct pipe_context *ctx,
return 0;
}
+static int si_shader_select(struct pipe_context *ctx,
+ struct si_shader_ctx_state *state)
+{
+ union si_shader_key key;
+
+ si_shader_selector_key(ctx, state->cso, &key);
+ return si_shader_select_with_key(ctx, state, &key);
+}
+
static void *si_create_shader_selector(struct pipe_context *ctx,
const struct pipe_shader_state *state)
{
@@ -888,8 +895,27 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
/* Pre-compilation. */
if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
struct si_shader_ctx_state state = {sel};
+ union si_shader_key key;
- if (si_shader_select(ctx, &state)) {
+ memset(&key, 0, sizeof(key));
+
+ /* Set reasonable defaults, so that the shader key doesn't
+ * cause any code to be eliminated.
+ */
+ switch (sel->type) {
+ case PIPE_SHADER_TESS_CTRL:
+ key.tcs.prim_mode = PIPE_PRIM_TRIANGLES;
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ key.ps.alpha_func = PIPE_FUNC_ALWAYS;
+ for (i = 0; i < 8; i++)
+ if (sel->info.colors_written & (1 << i))
+ key.ps.spi_shader_col_format |=
+ V_028710_SPI_SHADER_FP16_ABGR << (i * 4);
+ break;
+ }
+
+ if (si_shader_select_with_key(ctx, &state, &key)) {
fprintf(stderr, "radeonsi: can't create a shader\n");
tgsi_free_tokens(sel->tokens);
FREE(sel);
@@ -1001,7 +1027,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
sctx->ps_shader.cso = sel;
sctx->ps_shader.current = sel ? sel->first_variant : NULL;
- si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
+ si_mark_atom_dirty(sctx, &sctx->cb_render_state);
}
static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
@@ -1726,6 +1752,9 @@ bool si_update_shaders(struct si_context *sctx)
si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
}
+ if (sctx->b.family == CHIP_STONEY && si_pm4_state_changed(sctx, ps))
+ si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+
if (sctx->ps_db_shader_control != db_shader_control) {
sctx->ps_db_shader_control = db_shader_control;
si_mark_atom_dirty(sctx, &sctx->db_render_state);
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 3bc580899d4..097ffe6f920 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -179,6 +179,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 1;
case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
return 1;
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ return 0;
case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
return 65536;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
@@ -261,6 +263,9 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
}
/* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 8d04222a0cd..d5405f8eacf 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -358,6 +358,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
return 64;
@@ -396,6 +398,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_DRAW_PARAMETERS:
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 0;
}
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 6e703f76499..4d03fe1ee0b 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -1578,6 +1578,45 @@ static void trace_context_set_tess_state(struct pipe_context *_context,
}
+static void trace_context_set_shader_buffers(struct pipe_context *_context,
+ unsigned shader,
+ unsigned start, unsigned nr,
+ struct pipe_shader_buffer *buffers)
+{
+ struct trace_context *tr_context = trace_context(_context);
+ struct pipe_context *context = tr_context->pipe;
+ struct pipe_shader_buffer *_buffers = NULL;
+
+ trace_dump_call_begin("pipe_context", "set_shader_buffers");
+ trace_dump_arg(ptr, context);
+ trace_dump_arg(uint, shader);
+ trace_dump_arg(uint, start);
+ trace_dump_arg_begin("buffers");
+ trace_dump_struct_array(shader_buffer, buffers, nr);
+ trace_dump_arg_end();
+ trace_dump_call_end();
+
+ if (buffers) {
+ int i;
+
+ _buffers = MALLOC(nr * sizeof(struct pipe_shader_buffer));
+ if (!_buffers)
+ return;
+
+ for (i = 0; i < nr; i++) {
+ _buffers[i] = buffers[i];
+ _buffers[i].buffer = trace_resource_unwrap(
+ tr_context, _buffers[i].buffer);
+ }
+ }
+
+ context->set_shader_buffers(context, shader, start, nr, _buffers);
+
+ if (_buffers)
+ FREE(_buffers);
+}
+
+
static const struct debug_named_value rbug_blocker_flags[] = {
{"before", 1, NULL},
{"after", 2, NULL},
@@ -1675,6 +1714,7 @@ trace_context_create(struct trace_screen *tr_scr,
TR_CTX_INIT(texture_barrier);
TR_CTX_INIT(memory_barrier);
TR_CTX_INIT(set_tess_state);
+ TR_CTX_INIT(set_shader_buffers);
TR_CTX_INIT(transfer_map);
TR_CTX_INIT(transfer_unmap);
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 54f022a8ab6..cfbf53cf767 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -688,6 +688,24 @@ void trace_dump_constant_buffer(const struct pipe_constant_buffer *state)
}
+void trace_dump_shader_buffer(const struct pipe_shader_buffer *state)
+{
+ if (!trace_dumping_enabled_locked())
+ return;
+
+ if(!state) {
+ trace_dump_null();
+ return;
+ }
+
+ trace_dump_struct_begin("pipe_shader_buffer");
+ trace_dump_member(resource_ptr, state, buffer);
+ trace_dump_member(uint, state, buffer_offset);
+ trace_dump_member(uint, state, buffer_size);
+ trace_dump_struct_end();
+}
+
+
void trace_dump_draw_info(const struct pipe_draw_info *state)
{
if (!trace_dumping_enabled_locked())
diff --git a/src/gallium/drivers/trace/tr_dump_state.h b/src/gallium/drivers/trace/tr_dump_state.h
index 117b3c75e87..4f4ade155bc 100644
--- a/src/gallium/drivers/trace/tr_dump_state.h
+++ b/src/gallium/drivers/trace/tr_dump_state.h
@@ -78,6 +78,8 @@ void trace_dump_vertex_element(const struct pipe_vertex_element *state);
void trace_dump_constant_buffer(const struct pipe_constant_buffer *state);
+void trace_dump_shader_buffer(const struct pipe_shader_buffer *buffer);
+
void trace_dump_draw_info(const struct pipe_draw_info *state);
void trace_dump_blit_info(const struct pipe_blit_info *);
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 5d071ec862f..41660f6ac4d 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -245,10 +245,19 @@ vc4_job_submit(struct vc4_context *vc4)
fprintf(stderr, "Draw call returned %s. "
"Expect corruption.\n", strerror(errno));
warned = true;
+ } else if (!ret) {
+ vc4->last_emit_seqno = submit.seqno;
}
}
- vc4->last_emit_seqno = submit.seqno;
+ if (vc4->last_emit_seqno - vc4->screen->finished_seqno > 5) {
+ if (!vc4_wait_seqno(vc4->screen,
+ vc4->last_emit_seqno - 5,
+ PIPE_TIMEOUT_INFINITE,
+ "job throttling")) {
+ fprintf(stderr, "Job throttling failed\n");
+ }
+ }
if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) {
if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno,
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 08c2dad8406..b19d31af6ac 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -127,6 +127,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
/* Unsupported features. */
case PIPE_CAP_ANISOTROPIC_FILTER:
case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
case PIPE_CAP_CUBE_MAP_ARRAY:
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
@@ -199,6 +200,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
/* Stream output. */
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index fb2e5670ef0..18263e91e6a 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -169,6 +169,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
return vscreen->caps.caps.v1.max_tbo_size > 0;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
return 0;
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ return 0;
case PIPE_CAP_CUBE_MAP_ARRAY:
return vscreen->caps.caps.v1.bset.cube_map_array;
case PIPE_CAP_TEXTURE_MULTISAMPLE:
@@ -228,6 +230,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 0;
case PIPE_CAP_VENDOR_ID:
return 0x1af4;
@@ -557,6 +561,7 @@ virgl_create_screen(struct virgl_winsys *vws)
vws->get_caps(vws, &screen->caps);
+ screen->refcnt = 1;
util_format_s3tc_init();
return &screen->base;
diff --git a/src/gallium/drivers/virgl/virgl_screen.h b/src/gallium/drivers/virgl/virgl_screen.h
index 52e72ca4958..8cac38d7e96 100644
--- a/src/gallium/drivers/virgl/virgl_screen.h
+++ b/src/gallium/drivers/virgl/virgl_screen.h
@@ -28,6 +28,12 @@
struct virgl_screen {
struct pipe_screen base;
+
+ int refcnt;
+
+ /* place for winsys to stash its own stuff: */
+ void *winsys_priv;
+
struct virgl_winsys *vws;
struct virgl_drm_caps caps;
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index f69a75be50e..6c95b7b2178 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -150,6 +150,28 @@ struct pipe_context {
struct pipe_query *q,
boolean wait,
union pipe_query_result *result);
+
+ /**
+ * Get results of a query, storing into resource. Note that this may not
+ * be used with batch queries.
+ *
+ * \param wait if true, this query will block until the result is ready
+ * \param result_type the type of the value being stored
+ * \param index for queries that return multiple pieces of data, which
+ * item of that data to store (e.g. for
+ * PIPE_QUERY_PIPELINE_STATISTICS).
+ * When the index is -1, instead of the value of the query
+ * the driver should instead write a 1/0 to the appropriate
+ * location with 1 meaning that the query result is available.
+ */
+ void (*get_query_result_resource)(struct pipe_context *pipe,
+ struct pipe_query *q,
+ boolean wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset);
+
/*@}*/
/**
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b46187bc8a1..800f16cd250 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -352,6 +352,8 @@ enum pipe_flush_flags
* Flags for pipe_context::memory_barrier.
*/
#define PIPE_BARRIER_MAPPED_BUFFER (1 << 0)
+#define PIPE_BARRIER_SHADER_BUFFER (1 << 1)
+#define PIPE_BARRIER_QUERY_BUFFER (1 << 2)
/**
* Resource binding flags -- state tracker must specify in advance all
@@ -375,6 +377,7 @@ enum pipe_flush_flags
#define PIPE_BIND_SHADER_IMAGE (1 << 15) /* set_shader_images */
#define PIPE_BIND_COMPUTE_RESOURCE (1 << 16) /* set_compute_resources */
#define PIPE_BIND_COMMAND_ARGS_BUFFER (1 << 17) /* pipe_draw_info.indirect */
+#define PIPE_BIND_QUERY_BUFFER (1 << 18) /* get_query_result_resource */
/**
* The first two flags above were previously part of the amorphous
@@ -588,6 +591,7 @@ enum pipe_cap
PIPE_CAP_CUBE_MAP_ARRAY,
PIPE_CAP_TEXTURE_BUFFER_OBJECTS,
PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT,
+ PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY,
PIPE_CAP_TGSI_TEXCOORD,
PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER,
PIPE_CAP_QUERY_PIPELINE_STATISTICS,
@@ -645,6 +649,9 @@ enum pipe_cap
PIPE_CAP_INVALIDATE_BUFFER,
PIPE_CAP_GENERATE_MIPMAP,
PIPE_CAP_STRING_MARKER,
+ PIPE_CAP_SURFACE_REINTERPRET_BLOCKS,
+ PIPE_CAP_QUERY_BUFFER_OBJECT,
+ PIPE_CAP_QUERY_MEMORY_INFO,
};
#define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -837,6 +844,14 @@ union pipe_query_result
union pipe_numeric_type_union batch[1];
};
+enum pipe_query_value_type
+{
+ PIPE_QUERY_TYPE_I32,
+ PIPE_QUERY_TYPE_U32,
+ PIPE_QUERY_TYPE_I64,
+ PIPE_QUERY_TYPE_U64,
+};
+
union pipe_color_union
{
float f[4];
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index f868d71db23..211bc2440f9 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -57,6 +57,7 @@ struct pipe_resource;
struct pipe_surface;
struct pipe_transfer;
struct pipe_box;
+struct pipe_memory_info;
/**
@@ -260,6 +261,11 @@ struct pipe_screen {
unsigned index,
struct pipe_driver_query_group_info *info);
+ /**
+ * Query information about memory usage.
+ */
+ void (*query_memory_info)(struct pipe_screen *screen,
+ struct pipe_memory_info *info);
};
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index f300207d4dd..6539017b77c 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -420,7 +420,7 @@ struct tgsi_property_data {
#define TGSI_OPCODE_FSLT 110
#define TGSI_OPCODE_FSNE 111
- /* gap */
+#define TGSI_OPCODE_MEMBAR 112
#define TGSI_OPCODE_CALLNZ 113
/* gap */
#define TGSI_OPCODE_BREAKC 115
@@ -744,6 +744,11 @@ struct tgsi_instruction_memory
unsigned Padding : 29;
};
+#define TGSI_MEMBAR_SHADER_BUFFER (1 << 0)
+#define TGSI_MEMBAR_ATOMIC_BUFFER (1 << 1)
+#define TGSI_MEMBAR_SHADER_IMAGE (1 << 2)
+#define TGSI_MEMBAR_SHARED (1 << 3)
+#define TGSI_MEMBAR_THREAD_GROUP (1 << 4)
#ifdef __cplusplus
}
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 2e4d2830199..ed62a33ad72 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -720,6 +720,19 @@ struct pipe_debug_callback
void *data;
};
+/**
+ * Information about memory usage. All sizes are in kilobytes.
+ */
+struct pipe_memory_info
+{
+ unsigned total_device_memory; /**< size of device memory, e.g. VRAM */
+ unsigned avail_device_memory; /**< free device memory at the moment */
+ unsigned total_staging_memory; /**< size of staging memory, e.g. GART */
+ unsigned avail_staging_memory; /**< free staging memory at the moment */
+ unsigned device_memory_evicted; /**< size of memory evicted (monotonic counter) */
+ unsigned nr_device_memory_evictions; /**< # of evictions (monotonic counter) */
+};
+
#ifdef __cplusplus
}
#endif
diff --git a/src/gallium/state_trackers/nine/Makefile.sources b/src/gallium/state_trackers/nine/Makefile.sources
index 99b623a5b59..8d178d4b18f 100644
--- a/src/gallium/state_trackers/nine/Makefile.sources
+++ b/src/gallium/state_trackers/nine/Makefile.sources
@@ -5,6 +5,8 @@ C_SOURCES := \
authenticatedchannel9.h \
basetexture9.c \
basetexture9.h \
+ buffer9.c \
+ buffer9.h \
cryptosession9.c \
cryptosession9.h \
cubetexture9.c \
diff --git a/src/gallium/state_trackers/nine/adapter9.c b/src/gallium/state_trackers/nine/adapter9.c
index 69e0fa25961..8428b1bd7eb 100644
--- a/src/gallium/state_trackers/nine/adapter9.c
+++ b/src/gallium/state_trackers/nine/adapter9.c
@@ -563,7 +563,7 @@ NineAdapter9_GetDeviceCaps( struct NineAdapter9 *This,
D3DPIPECAP(INDEP_BLEND_ENABLE, D3DPMISCCAPS_INDEPENDENTWRITEMASKS) |
/*D3DPMISCCAPS_PERSTAGECONSTANT |*/ /* TODO */
/*D3DPMISCCAPS_POSTBLENDSRGBCONVERT |*/ /* TODO */
- D3DPMISCCAPS_FOGANDSPECULARALPHA |
+ D3DPMISCCAPS_FOGANDSPECULARALPHA | /* Note: documentation of the flag is wrong */
D3DPIPECAP(BLEND_EQUATION_SEPARATE, D3DPMISCCAPS_SEPARATEALPHABLEND) |
D3DPIPECAP(MIXED_COLORBUFFER_FORMATS, D3DPMISCCAPS_MRTINDEPENDENTBITDEPTHS) |
D3DPMISCCAPS_MRTPOSTPIXELSHADERBLENDING |
@@ -618,7 +618,8 @@ NineAdapter9_GetDeviceCaps( struct NineAdapter9 *This,
pCaps->DestBlendCaps = pCaps->SrcBlendCaps;
- pCaps->AlphaCmpCaps = D3DPCMPCAPS_LESS |
+ pCaps->AlphaCmpCaps = D3DPCMPCAPS_NEVER |
+ D3DPCMPCAPS_LESS |
D3DPCMPCAPS_EQUAL |
D3DPCMPCAPS_LESSEQUAL |
D3DPCMPCAPS_GREATER |
@@ -980,7 +981,8 @@ NineAdapter9_CreateDevice( struct NineAdapter9 *This,
hr = NineDevice9_new(screen, ¶ms, &caps, pPresentationParameters,
pD3D9, pPresentationGroup, This->ctx, FALSE, NULL,
- (struct NineDevice9 **)ppReturnedDeviceInterface);
+ (struct NineDevice9 **)ppReturnedDeviceInterface,
+ minor);
if (FAILED(hr)) {
DBG("Failed to create device.\n");
return hr;
@@ -1041,7 +1043,8 @@ NineAdapter9_CreateDeviceEx( struct NineAdapter9 *This,
hr = NineDevice9Ex_new(screen, ¶ms, &caps, pPresentationParameters,
pFullscreenDisplayMode,
pD3D9Ex, pPresentationGroup, This->ctx,
- (struct NineDevice9Ex **)ppReturnedDeviceInterface);
+ (struct NineDevice9Ex **)ppReturnedDeviceInterface,
+ minor);
if (FAILED(hr)) {
DBG("Failed to create device.\n");
return hr;
diff --git a/src/gallium/state_trackers/nine/basetexture9.c b/src/gallium/state_trackers/nine/basetexture9.c
index d13138b7d5c..7a0959a8f3e 100644
--- a/src/gallium/state_trackers/nine/basetexture9.c
+++ b/src/gallium/state_trackers/nine/basetexture9.c
@@ -319,7 +319,7 @@ NineBaseTexture9_UploadSelf( struct NineBaseTexture9 *This )
if (tex->dirty_box.width) {
for (l = min_level_dirty; l <= last_level; ++l) {
- u_box_minify_2d(&box, &tex->dirty_box, l);
+ u_box_minify_3d(&box, &tex->dirty_box, l);
NineVolume9_UploadSelf(tex->volumes[l], &box);
}
memset(&tex->dirty_box, 0, sizeof(tex->dirty_box));
diff --git a/src/gallium/state_trackers/nine/buffer9.c b/src/gallium/state_trackers/nine/buffer9.c
new file mode 100644
index 00000000000..b4b91ec2a02
--- /dev/null
+++ b/src/gallium/state_trackers/nine/buffer9.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2011 Joakim Sindholt
+ * Copyright 2015 Patrick Rudolph
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "buffer9.h"
+#include "device9.h"
+#include "nine_helpers.h"
+#include "nine_pipe.h"
+
+#include "pipe/p_screen.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_format.h"
+#include "util/u_box.h"
+
+#define DBG_CHANNEL (DBG_INDEXBUFFER|DBG_VERTEXBUFFER)
+
+HRESULT
+NineBuffer9_ctor( struct NineBuffer9 *This,
+ struct NineUnknownParams *pParams,
+ D3DRESOURCETYPE Type,
+ DWORD Usage,
+ UINT Size,
+ D3DPOOL Pool )
+{
+ struct pipe_resource *info = &This->base.info;
+ HRESULT hr;
+
+ DBG("This=%p Size=0x%x Usage=%x Pool=%u\n", This, Size, Usage, Pool);
+
+ user_assert(Pool != D3DPOOL_SCRATCH, D3DERR_INVALIDCALL);
+
+ This->maps = MALLOC(sizeof(struct pipe_transfer *));
+ if (!This->maps)
+ return E_OUTOFMEMORY;
+ This->nmaps = 0;
+ This->maxmaps = 1;
+ This->size = Size;
+
+ This->pipe = pParams->device->pipe;
+
+ info->screen = pParams->device->screen;
+ info->target = PIPE_BUFFER;
+ info->format = PIPE_FORMAT_R8_UNORM;
+ info->width0 = Size;
+ info->flags = 0;
+
+ info->bind = PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_TRANSFER_WRITE;
+ if (!(Usage & D3DUSAGE_WRITEONLY))
+ info->bind |= PIPE_BIND_TRANSFER_READ;
+
+ info->usage = PIPE_USAGE_DEFAULT;
+ if (Usage & D3DUSAGE_DYNAMIC)
+ info->usage = PIPE_USAGE_STREAM;
+ else if (Pool == D3DPOOL_SYSTEMMEM)
+ info->usage = PIPE_USAGE_STAGING;
+
+ /* if (pDesc->Usage & D3DUSAGE_DONOTCLIP) { } */
+ /* if (pDesc->Usage & D3DUSAGE_NONSECURE) { } */
+ /* if (pDesc->Usage & D3DUSAGE_NPATCHES) { } */
+ /* if (pDesc->Usage & D3DUSAGE_POINTS) { } */
+ /* if (pDesc->Usage & D3DUSAGE_RTPATCHES) { } */
+ if (Usage & D3DUSAGE_SOFTWAREPROCESSING)
+ DBG("Application asked for Software Vertex Processing, "
+ "but this is unimplemented\n");
+ /* if (pDesc->Usage & D3DUSAGE_TEXTAPI) { } */
+
+ info->height0 = 1;
+ info->depth0 = 1;
+ info->array_size = 1;
+ info->last_level = 0;
+ info->nr_samples = 0;
+
+ hr = NineResource9_ctor(&This->base, pParams, NULL, TRUE,
+ Type, Pool, Usage);
+ return hr;
+}
+
+void
+NineBuffer9_dtor( struct NineBuffer9 *This )
+{
+ if (This->maps) {
+ while (This->nmaps) {
+ NineBuffer9_Unlock(This);
+ }
+ FREE(This->maps);
+ }
+
+ NineResource9_dtor(&This->base);
+}
+
+struct pipe_resource *
+NineBuffer9_GetResource( struct NineBuffer9 *This )
+{
+ return NineResource9_GetResource(&This->base);
+}
+
+HRESULT WINAPI
+NineBuffer9_Lock( struct NineBuffer9 *This,
+ UINT OffsetToLock,
+ UINT SizeToLock,
+ void **ppbData,
+ DWORD Flags )
+{
+ struct pipe_box box;
+ void *data;
+ unsigned usage = d3dlock_buffer_to_pipe_transfer_usage(Flags);
+
+ DBG("This=%p(pipe=%p) OffsetToLock=0x%x, SizeToLock=0x%x, Flags=0x%x\n",
+ This, This->base.resource,
+ OffsetToLock, SizeToLock, Flags);
+
+ user_assert(ppbData, E_POINTER);
+ user_assert(!(Flags & ~(D3DLOCK_DISCARD |
+ D3DLOCK_DONOTWAIT |
+ D3DLOCK_NO_DIRTY_UPDATE |
+ D3DLOCK_NOSYSLOCK |
+ D3DLOCK_READONLY |
+ D3DLOCK_NOOVERWRITE)), D3DERR_INVALIDCALL);
+
+ if (This->nmaps == This->maxmaps) {
+ struct pipe_transfer **newmaps =
+ REALLOC(This->maps, sizeof(struct pipe_transfer *)*This->maxmaps,
+ sizeof(struct pipe_transfer *)*(This->maxmaps << 1));
+ if (newmaps == NULL)
+ return E_OUTOFMEMORY;
+
+ This->maxmaps <<= 1;
+ This->maps = newmaps;
+ }
+
+ if (SizeToLock == 0) {
+ SizeToLock = This->size - OffsetToLock;
+ user_warn(OffsetToLock != 0);
+ }
+
+ u_box_1d(OffsetToLock, SizeToLock, &box);
+
+ data = This->pipe->transfer_map(This->pipe, This->base.resource, 0,
+ usage, &box, &This->maps[This->nmaps]);
+
+ if (!data) {
+ DBG("pipe::transfer_map failed\n"
+ " usage = %x\n"
+ " box.x = %u\n"
+ " box.width = %u\n",
+ usage, box.x, box.width);
+ /* not sure what to return, msdn suggests this */
+ if (Flags & D3DLOCK_DONOTWAIT)
+ return D3DERR_WASSTILLDRAWING;
+ return D3DERR_INVALIDCALL;
+ }
+
+ DBG("returning pointer %p\n", data);
+ This->nmaps++;
+ *ppbData = data;
+
+ return D3D_OK;
+}
+
+HRESULT WINAPI
+NineBuffer9_Unlock( struct NineBuffer9 *This )
+{
+ DBG("This=%p\n", This);
+
+ user_assert(This->nmaps > 0, D3DERR_INVALIDCALL);
+ This->pipe->transfer_unmap(This->pipe, This->maps[--(This->nmaps)]);
+ return D3D_OK;
+}
diff --git a/src/gallium/state_trackers/nine/buffer9.h b/src/gallium/state_trackers/nine/buffer9.h
new file mode 100644
index 00000000000..1afd9a996ea
--- /dev/null
+++ b/src/gallium/state_trackers/nine/buffer9.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2011 Joakim Sindholt
+ * Copyright 2015 Patrick Rudolph
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef _NINE_BUFFER9_H_
+#define _NINE_BUFFER9_H_
+
+#include "resource9.h"
+
+struct pipe_screen;
+struct pipe_context;
+struct pipe_transfer;
+
+struct NineBuffer9
+{
+ struct NineResource9 base;
+
+ /* G3D */
+ struct pipe_context *pipe;
+ struct pipe_transfer **maps;
+ int nmaps, maxmaps;
+ UINT size;
+};
+static inline struct NineBuffer9 *
+NineBuffer9( void *data )
+{
+ return (struct NineBuffer9 *)data;
+}
+
+HRESULT
+NineBuffer9_ctor( struct NineBuffer9 *This,
+ struct NineUnknownParams *pParams,
+ D3DRESOURCETYPE Type,
+ DWORD Usage,
+ UINT Size,
+ D3DPOOL Pool );
+
+void
+NineBuffer9_dtor( struct NineBuffer9 *This );
+
+struct pipe_resource *
+NineBuffer9_GetResource( struct NineBuffer9 *This );
+
+HRESULT WINAPI
+NineBuffer9_Lock( struct NineBuffer9 *This,
+ UINT OffsetToLock,
+ UINT SizeToLock,
+ void **ppbData,
+ DWORD Flags );
+
+HRESULT WINAPI
+NineBuffer9_Unlock( struct NineBuffer9 *This );
+
+#endif /* _NINE_BUFFER9_H_ */
diff --git a/src/gallium/state_trackers/nine/cubetexture9.c b/src/gallium/state_trackers/nine/cubetexture9.c
index abba2637946..460cc853942 100644
--- a/src/gallium/state_trackers/nine/cubetexture9.c
+++ b/src/gallium/state_trackers/nine/cubetexture9.c
@@ -181,7 +181,7 @@ NineCubeTexture9_dtor( struct NineCubeTexture9 *This )
}
if (This->managed_buffer)
- FREE(This->managed_buffer);
+ align_free(This->managed_buffer);
NineBaseTexture9_dtor(&This->base);
}
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 0be83658928..475ef96788e 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -38,6 +38,7 @@
#include "nine_pipe.h"
#include "nine_ff.h"
#include "nine_dump.h"
+#include "nine_limits.h"
#include "pipe/p_screen.h"
#include "pipe/p_context.h"
@@ -81,7 +82,7 @@ static void nine_setup_fpu(void)
#endif
-static void
+void
NineDevice9_SetDefaultState( struct NineDevice9 *This, boolean is_reset )
{
struct NineSurface9 *refSurf = NULL;
@@ -112,8 +113,10 @@ NineDevice9_SetDefaultState( struct NineDevice9 *This, boolean is_reset )
This->state.scissor.maxy = refSurf->desc.Height;
}
- if (This->nswapchains && This->swapchains[0]->params.EnableAutoDepthStencil)
+ if (This->nswapchains && This->swapchains[0]->params.EnableAutoDepthStencil) {
This->state.rs[D3DRS_ZENABLE] = TRUE;
+ This->state.rs_advertised[D3DRS_ZENABLE] = TRUE;
+ }
if (This->state.rs[D3DRS_ZENABLE])
NineDevice9_SetDepthStencilSurface(
This, (IDirect3DSurface9 *)This->swapchains[0]->zsbuf);
@@ -131,7 +134,8 @@ NineDevice9_ctor( struct NineDevice9 *This,
ID3DPresentGroup *pPresentationGroup,
struct d3dadapter9_context *pCTX,
boolean ex,
- D3DDISPLAYMODEEX *pFullscreenDisplayMode )
+ D3DDISPLAYMODEEX *pFullscreenDisplayMode,
+ int minorVersionNum )
{
unsigned i;
HRESULT hr = NineUnknown_ctor(&This->base, pParams);
@@ -152,6 +156,8 @@ NineDevice9_ctor( struct NineDevice9 *This,
This->params = *pCreationParameters;
This->ex = ex;
This->present = pPresentationGroup;
+ This->minor_version_num = minorVersionNum;
+
IDirect3D9_AddRef(This->d3d9);
ID3DPresentGroup_AddRef(This->present);
@@ -172,6 +178,19 @@ NineDevice9_ctor( struct NineDevice9 *This,
/* Create first, it messes up our state. */
This->hud = hud_create(This->pipe, This->cso); /* NULL result is fine */
+ /* Available memory counter. Updated only for allocations with this device
+ * instance. This is the Win 7 behavior.
+ * Win XP shares this counter across multiple devices. */
+ This->available_texture_mem = This->screen->get_param(This->screen, PIPE_CAP_VIDEO_MEMORY);
+ if (This->available_texture_mem < 4096)
+ This->available_texture_mem <<= 20;
+ else
+ This->available_texture_mem = UINT_MAX;
+ /* We cap texture memory usage to 80% of what is reported free initially
+ * This helps get closer Win behaviour. For example VertexBuffer allocation
+ * still succeeds when texture allocation fails. */
+ This->available_texture_limit = This->available_texture_mem * 20LL / 100LL;
+
/* create implicit swapchains */
This->nswapchains = ID3DPresentGroup_GetMultiheadCount(This->present);
This->swapchains = CALLOC(This->nswapchains,
@@ -460,7 +479,8 @@ NineDevice9_dtor( struct NineDevice9 *This )
if (This->swapchains) {
for (i = 0; i < This->nswapchains; ++i)
- NineUnknown_Unbind(NineUnknown(This->swapchains[i]));
+ if (This->swapchains[i])
+ NineUnknown_Unbind(NineUnknown(This->swapchains[i]));
FREE(This->swapchains);
}
@@ -523,17 +543,20 @@ NineDevice9_ResumeRecording( struct NineDevice9 *This )
HRESULT WINAPI
NineDevice9_TestCooperativeLevel( struct NineDevice9 *This )
{
- return D3D_OK; /* TODO */
+ if (NineSwapChain9_GetOccluded(This->swapchains[0])) {
+ This->device_needs_reset = TRUE;
+ return D3DERR_DEVICELOST;
+ } else if (This->device_needs_reset) {
+ return D3DERR_DEVICENOTRESET;
+ }
+
+ return D3D_OK;
}
UINT WINAPI
NineDevice9_GetAvailableTextureMem( struct NineDevice9 *This )
{
- const unsigned mem = This->screen->get_param(This->screen, PIPE_CAP_VIDEO_MEMORY);
- if (mem < 4096)
- return mem << 20;
- else
- return UINT_MAX;
+ return This->available_texture_mem;
}
HRESULT WINAPI
@@ -606,6 +629,7 @@ NineDevice9_SetCursorProperties( struct NineDevice9 *This,
"pCursorBitmap=%p\n", This, XHotSpot, YHotSpot, pCursorBitmap);
user_assert(pCursorBitmap, D3DERR_INVALIDCALL);
+ user_assert(surf->desc.Format == D3DFMT_A8R8G8B8, D3DERR_INVALIDCALL);
if (This->swapchains[0]->params.Windowed) {
This->cursor.w = MIN2(surf->desc.Width, 32);
@@ -709,6 +733,11 @@ NineDevice9_CreateAdditionalSwapChain( struct NineDevice9 *This,
This, pPresentationParameters, pSwapChain);
user_assert(pPresentationParameters, D3DERR_INVALIDCALL);
+ user_assert(tmplt->params.Windowed && pPresentationParameters->Windowed, D3DERR_INVALIDCALL);
+
+ /* TODO: this deserves more tests */
+ if (!pPresentationParameters->hDeviceWindow)
+ pPresentationParameters->hDeviceWindow = This->params.hFocusWindow;
hr = ID3DPresentGroup_CreateAdditionalPresent(This->present, pPresentationParameters, &present);
@@ -757,11 +786,16 @@ NineDevice9_Reset( struct NineDevice9 *This,
DBG("This=%p pPresentationParameters=%p\n", This, pPresentationParameters);
+ if (NineSwapChain9_GetOccluded(This->swapchains[0])) {
+ This->device_needs_reset = TRUE;
+ return D3DERR_DEVICELOST;
+ }
+
for (i = 0; i < This->nswapchains; ++i) {
D3DPRESENT_PARAMETERS *params = &pPresentationParameters[i];
hr = NineSwapChain9_Resize(This->swapchains[i], params, NULL);
if (hr != D3D_OK)
- return hr;
+ break;
}
nine_pipe_context_clear(This);
@@ -772,6 +806,7 @@ NineDevice9_Reset( struct NineDevice9 *This,
This, 0, (IDirect3DSurface9 *)This->swapchains[0]->buffers[0]);
/* XXX: better use GetBackBuffer here ? */
+ This->device_needs_reset = (hr != D3D_OK);
return hr;
}
@@ -806,6 +841,8 @@ NineDevice9_GetBackBuffer( struct NineDevice9 *This,
IDirect3DSurface9 **ppBackBuffer )
{
user_assert(ppBackBuffer != NULL, D3DERR_INVALIDCALL);
+ /* return NULL on error */
+ *ppBackBuffer = NULL;
user_assert(iSwapChain < This->nswapchains, D3DERR_INVALIDCALL);
return NineSwapChain9_GetBackBuffer(This->swapchains[iSwapChain],
@@ -1455,7 +1492,7 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
struct NineSurface9 *src = NineSurface9(pSourceSurface);
struct pipe_resource *dst_res = NineSurface9_GetResource(dst);
struct pipe_resource *src_res = NineSurface9_GetResource(src);
- const boolean zs = util_format_is_depth_or_stencil(dst_res->format);
+ boolean zs;
struct pipe_blit_info blit;
boolean scaled, clamped, ms, flip_x = FALSE, flip_y = FALSE;
@@ -1470,6 +1507,9 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
DBG("pDestRect=(%u,%u)-(%u,%u)\n", pDestRect->left, pDestRect->top,
pDestRect->right, pDestRect->bottom);
+ user_assert(dst->base.pool == D3DPOOL_DEFAULT &&
+ src->base.pool == D3DPOOL_DEFAULT, D3DERR_INVALIDCALL);
+ zs = util_format_is_depth_or_stencil(dst_res->format);
user_assert(!zs || !This->in_scene, D3DERR_INVALIDCALL);
user_assert(!zs || !pSourceRect ||
(pSourceRect->left == 0 &&
@@ -1493,8 +1533,6 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
src_res->nr_samples,
PIPE_BIND_SAMPLER_VIEW),
D3DERR_INVALIDCALL);
- user_assert(dst->base.pool == D3DPOOL_DEFAULT &&
- src->base.pool == D3DPOOL_DEFAULT, D3DERR_INVALIDCALL);
/* We might want to permit these, but wine thinks we shouldn't. */
user_assert(!pDestRect ||
@@ -1668,6 +1706,8 @@ NineDevice9_ColorFill( struct NineDevice9 *This,
user_assert((surf->base.usage & D3DUSAGE_RENDERTARGET) ||
NineSurface9_IsOffscreenPlain(surf), D3DERR_INVALIDCALL);
+ user_assert(surf->desc.Format != D3DFMT_NULL, D3D_OK);
+
if (pRect) {
x = pRect->left;
y = pRect->top;
@@ -1884,15 +1924,18 @@ NineDevice9_Clear( struct NineDevice9 *This,
Count = 0;
#endif
+ nine_update_state_framebuffer_clear(This);
+
if (Flags & D3DCLEAR_TARGET) bufs |= PIPE_CLEAR_COLOR;
- if (Flags & D3DCLEAR_ZBUFFER) bufs |= PIPE_CLEAR_DEPTH;
- if (Flags & D3DCLEAR_STENCIL) bufs |= PIPE_CLEAR_STENCIL;
+ /* Ignore Z buffer if not bound */
+ if (This->state.fb.zsbuf != NULL) {
+ if (Flags & D3DCLEAR_ZBUFFER) bufs |= PIPE_CLEAR_DEPTH;
+ if (Flags & D3DCLEAR_STENCIL) bufs |= PIPE_CLEAR_STENCIL;
+ }
if (!bufs)
return D3D_OK;
d3dcolor_to_pipe_color_union(&rgba, Color);
- nine_update_state_framebuffer(This);
-
rect.x1 = This->state.viewport.X;
rect.y1 = This->state.viewport.Y;
rect.x2 = This->state.viewport.Width + rect.x1;
@@ -1935,7 +1978,6 @@ NineDevice9_Clear( struct NineDevice9 *This,
/* Case we clear depth buffer (and eventually rt too).
* depth buffer size is always >= rt size. Compare to clear region */
((bufs & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) &&
- This->state.fb.zsbuf != NULL &&
rect.x2 >= zsbuf_surf->desc.Width &&
rect.y2 >= zsbuf_surf->desc.Height))) {
DBG("Clear fast path\n");
@@ -2342,8 +2384,15 @@ NineDevice9_SetRenderState( struct NineDevice9 *This,
DBG("This=%p State=%u(%s) Value=%08x\n", This,
State, nine_d3drs_to_string(State), Value);
+ user_assert(State < D3DRS_COUNT, D3DERR_INVALIDCALL);
+
+ if (state->rs_advertised[State] == Value && likely(!This->is_recording))
+ return D3D_OK;
+
+ state->rs_advertised[State] = Value;
+
/* Amd hacks (equivalent to GL extensions) */
- if (State == D3DRS_POINTSIZE) {
+ if (unlikely(State == D3DRS_POINTSIZE)) {
if (Value == RESZ_CODE)
return NineDevice9_ResolveZ(This);
@@ -2356,20 +2405,17 @@ NineDevice9_SetRenderState( struct NineDevice9 *This,
}
/* NV hack */
- if (State == D3DRS_ADAPTIVETESS_Y &&
- (Value == D3DFMT_ATOC || (Value == D3DFMT_UNKNOWN && state->rs[NINED3DRS_ALPHACOVERAGE]))) {
+ if (unlikely(State == D3DRS_ADAPTIVETESS_Y)) {
+ if (Value == D3DFMT_ATOC || (Value == D3DFMT_UNKNOWN && state->rs[NINED3DRS_ALPHACOVERAGE])) {
state->rs[NINED3DRS_ALPHACOVERAGE] = (Value == D3DFMT_ATOC);
state->changed.group |= NINE_STATE_BLEND;
return D3D_OK;
+ }
}
- user_assert(State < Elements(state->rs), D3DERR_INVALIDCALL);
-
- if (likely(state->rs[State] != Value) || unlikely(This->is_recording)) {
- state->rs[State] = Value;
- state->changed.rs[State / 32] |= 1 << (State % 32);
- state->changed.group |= nine_render_state_group[State];
- }
+ state->rs[State] = nine_fix_render_state_value(State, Value);
+ state->changed.rs[State / 32] |= 1 << (State % 32);
+ state->changed.group |= nine_render_state_group[State];
return D3D_OK;
}
@@ -2379,9 +2425,9 @@ NineDevice9_GetRenderState( struct NineDevice9 *This,
D3DRENDERSTATETYPE State,
DWORD *pValue )
{
- user_assert(State < Elements(This->state.rs), D3DERR_INVALIDCALL);
+ user_assert(State < D3DRS_COUNT, D3DERR_INVALIDCALL);
- *pValue = This->state.rs[State];
+ *pValue = This->state.rs_advertised[State];
return D3D_OK;
}
@@ -3122,7 +3168,7 @@ NineDevice9_ProcessVertices( struct NineDevice9 *This,
buffer_offset = 0;
} else {
/* SO matches vertex declaration */
- resource = dst->base.resource;
+ resource = NineVertexBuffer9_GetResource(dst);
buffer_offset = DestIndex * vs->so->stride[0];
}
target = This->pipe->create_stream_output_target(This->pipe, resource,
@@ -3184,13 +3230,21 @@ NineDevice9_SetVertexDeclaration( struct NineDevice9 *This,
IDirect3DVertexDeclaration9 *pDecl )
{
struct nine_state *state = This->update;
+ BOOL was_programmable_vs = This->state.programmable_vs;
DBG("This=%p pDecl=%p\n", This, pDecl);
if (likely(!This->is_recording) && state->vdecl == NineVertexDeclaration9(pDecl))
return D3D_OK;
+
nine_bind(&state->vdecl, pDecl);
+ This->state.programmable_vs = This->state.vs && !(This->state.vdecl && This->state.vdecl->position_t);
+ if (likely(!This->is_recording) && was_programmable_vs != This->state.programmable_vs) {
+ state->commit |= NINE_STATE_COMMIT_CONST_VS;
+ state->changed.group |= NINE_STATE_VS;
+ }
+
state->changed.group |= NINE_STATE_VDECL;
return D3D_OK;
@@ -3262,18 +3316,21 @@ NineDevice9_SetVertexShader( struct NineDevice9 *This,
IDirect3DVertexShader9 *pShader )
{
struct nine_state *state = This->update;
+ BOOL was_programmable_vs = This->state.programmable_vs;
DBG("This=%p pShader=%p\n", This, pShader);
if (!This->is_recording && state->vs == (struct NineVertexShader9*)pShader)
return D3D_OK;
- /* ff -> non-ff: commit back non-ff constants */
- if (!state->vs && pShader)
- state->commit |= NINE_STATE_COMMIT_CONST_VS;
-
nine_bind(&state->vs, pShader);
+ This->state.programmable_vs = This->state.vs && !(This->state.vdecl && This->state.vdecl->position_t);
+
+ /* ff -> non-ff: commit back non-ff constants */
+ if (!was_programmable_vs && This->state.programmable_vs)
+ state->commit |= NINE_STATE_COMMIT_CONST_VS;
+
state->changed.group |= NINE_STATE_VS;
return D3D_OK;
@@ -3499,7 +3556,8 @@ NineDevice9_SetStreamSource( struct NineDevice9 *This,
state->vtxbuf[i].stride = Stride;
state->vtxbuf[i].buffer_offset = OffsetInBytes;
}
- state->vtxbuf[i].buffer = pStreamData ? pVBuf9->base.resource : NULL;
+ pipe_resource_reference(&state->vtxbuf[i].buffer,
+ pStreamData ? NineVertexBuffer9_GetResource(pVBuf9) : NULL);
return D3D_OK;
}
@@ -3542,6 +3600,9 @@ NineDevice9_SetStreamSourceFreq( struct NineDevice9 *This,
(Setting & D3DSTREAMSOURCE_INDEXEDDATA)), D3DERR_INVALIDCALL);
user_assert(Setting, D3DERR_INVALIDCALL);
+ if (likely(!This->is_recording) && state->stream_freq[StreamNumber] == Setting)
+ return D3D_OK;
+
state->stream_freq[StreamNumber] = Setting;
if (Setting & D3DSTREAMSOURCE_INSTANCEDATA)
@@ -3549,7 +3610,9 @@ NineDevice9_SetStreamSourceFreq( struct NineDevice9 *This,
else
state->stream_instancedata_mask &= ~(1 << StreamNumber);
- state->changed.stream_freq |= 1 << StreamNumber;
+ state->changed.stream_freq |= 1 << StreamNumber; /* Used for stateblocks */
+ if (StreamNumber != 0)
+ state->changed.group |= NINE_STATE_STREAMFREQ;
return D3D_OK;
}
@@ -4013,7 +4076,8 @@ NineDevice9_new( struct pipe_screen *pScreen,
struct d3dadapter9_context *pCTX,
boolean ex,
D3DDISPLAYMODEEX *pFullscreenDisplayMode,
- struct NineDevice9 **ppOut )
+ struct NineDevice9 **ppOut,
+ int minorVersionNum )
{
BOOL lock;
lock = !!(pCreationParameters->BehaviorFlags & D3DCREATE_MULTITHREADED);
@@ -4021,5 +4085,5 @@ NineDevice9_new( struct pipe_screen *pScreen,
NINE_NEW(Device9, ppOut, lock, /* args */
pScreen, pCreationParameters, pCaps,
pPresentationParameters, pD3D9, pPresentationGroup, pCTX,
- ex, pFullscreenDisplayMode);
+ ex, pFullscreenDisplayMode, minorVersionNum );
}
diff --git a/src/gallium/state_trackers/nine/device9.h b/src/gallium/state_trackers/nine/device9.h
index cbc1e61f5db..34edf0cfa48 100644
--- a/src/gallium/state_trackers/nine/device9.h
+++ b/src/gallium/state_trackers/nine/device9.h
@@ -137,6 +137,10 @@ struct NineDevice9
/* dummy vbo (containing 0 0 0 0) to bind if vertex shader input
* is not bound to anything by the vertex declaration */
struct pipe_resource *dummy_vbo;
+ BOOL device_needs_reset;
+ int minor_version_num;
+ long long available_texture_mem;
+ long long available_texture_limit;
};
static inline struct NineDevice9 *
NineDevice9( void *data )
@@ -154,7 +158,8 @@ NineDevice9_new( struct pipe_screen *pScreen,
struct d3dadapter9_context *pCTX,
boolean ex,
D3DDISPLAYMODEEX *pFullscreenDisplayMode,
- struct NineDevice9 **ppOut );
+ struct NineDevice9 **ppOut,
+ int minorVersionNum );
HRESULT
NineDevice9_ctor( struct NineDevice9 *This,
@@ -167,12 +172,15 @@ NineDevice9_ctor( struct NineDevice9 *This,
ID3DPresentGroup *pPresentationGroup,
struct d3dadapter9_context *pCTX,
boolean ex,
- D3DDISPLAYMODEEX *pFullscreenDisplayMode );
+ D3DDISPLAYMODEEX *pFullscreenDisplayMode,
+ int minorVersionNum );
void
NineDevice9_dtor( struct NineDevice9 *This );
/*** Nine private ***/
+void
+NineDevice9_SetDefaultState( struct NineDevice9 *This, boolean is_reset );
struct pipe_screen *
NineDevice9_GetScreen( struct NineDevice9 *This );
diff --git a/src/gallium/state_trackers/nine/device9ex.c b/src/gallium/state_trackers/nine/device9ex.c
index fe8aa9b2704..11244b1bedf 100644
--- a/src/gallium/state_trackers/nine/device9ex.c
+++ b/src/gallium/state_trackers/nine/device9ex.c
@@ -20,7 +20,9 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE. */
+#include "device9.h"
#include "device9ex.h"
+#include "nine_pipe.h"
#include "swapchain9ex.h"
#include "nine_helpers.h"
@@ -37,7 +39,8 @@ NineDevice9Ex_ctor( struct NineDevice9Ex *This,
D3DDISPLAYMODEEX *pFullscreenDisplayMode,
IDirect3D9Ex *pD3D9Ex,
ID3DPresentGroup *pPresentationGroup,
- struct d3dadapter9_context *pCTX )
+ struct d3dadapter9_context *pCTX,
+ int minorVersionNum )
{
DBG("This=%p pParams=%p pScreen=%p pCreationParameters=%p pCaps=%p "
"pPresentationParameters=%p pFullscreenDisplayMode=%p "
@@ -50,7 +53,7 @@ NineDevice9Ex_ctor( struct NineDevice9Ex *This,
pScreen, pCreationParameters, pCaps,
pPresentationParameters,
(IDirect3D9 *)pD3D9Ex, pPresentationGroup, pCTX,
- TRUE, pFullscreenDisplayMode);
+ TRUE, pFullscreenDisplayMode, minorVersionNum);
}
static void
@@ -158,6 +161,14 @@ NineDevice9Ex_CheckDeviceState( struct NineDevice9Ex *This,
DBG("This=%p hDestinationWindow=%p\n",
This, hDestinationWindow);
+ user_assert(!This->base.swapchains[0]->params.Windowed, D3D_OK);
+
+ if (This->base.params.hFocusWindow == hDestinationWindow) {
+ if (NineSwapChain9_GetOccluded(This->base.swapchains[0]))
+ return S_PRESENT_OCCLUDED;
+ } else if(!NineSwapChain9_GetOccluded(This->base.swapchains[0])) {
+ return S_PRESENT_OCCLUDED;
+ }
/* TODO: handle the other return values */
return D3D_OK;
}
@@ -221,12 +232,37 @@ NineDevice9Ex_ResetEx( struct NineDevice9Ex *This,
if (pFullscreenDisplayMode) mode = &(pFullscreenDisplayMode[i]);
hr = NineSwapChain9_Resize(This->base.swapchains[i], params, mode);
if (FAILED(hr))
- return (hr == D3DERR_OUTOFVIDEOMEMORY) ? hr : D3DERR_DEVICELOST;
+ break;
}
NineDevice9_SetRenderTarget(
(struct NineDevice9 *)This, 0, (IDirect3DSurface9 *)This->base.swapchains[0]->buffers[0]);
+ return hr;
+}
+
+HRESULT WINAPI
+NineDevice9Ex_Reset( struct NineDevice9Ex *This,
+ D3DPRESENT_PARAMETERS *pPresentationParameters )
+{
+ HRESULT hr = D3D_OK;
+ unsigned i;
+
+ DBG("This=%p pPresentationParameters=%p\n", This, pPresentationParameters);
+
+ for (i = 0; i < This->base.nswapchains; ++i) {
+ D3DPRESENT_PARAMETERS *params = &pPresentationParameters[i];
+ hr = NineSwapChain9_Resize(This->base.swapchains[i], params, NULL);
+ if (FAILED(hr))
+ break;
+ }
+
+ nine_pipe_context_clear((struct NineDevice9 *)This);
+ nine_state_clear(&This->base.state, TRUE);
+
+ NineDevice9_SetDefaultState((struct NineDevice9 *)This, TRUE);
+ NineDevice9_SetRenderTarget(
+ (struct NineDevice9 *)This, 0, (IDirect3DSurface9 *)This->base.swapchains[0]->buffers[0]);
return hr;
}
@@ -248,11 +284,18 @@ NineDevice9Ex_GetDisplayModeEx( struct NineDevice9Ex *This,
return NineSwapChain9Ex_GetDisplayModeEx(swapchain, pMode, pRotation);
}
+HRESULT WINAPI
+NineDevice9Ex_TestCooperativeLevel( struct NineDevice9Ex *This )
+{
+ return D3D_OK;
+}
+
+
IDirect3DDevice9ExVtbl NineDevice9Ex_vtable = {
(void *)NineUnknown_QueryInterface,
(void *)NineUnknown_AddRef,
(void *)NineUnknown_Release,
- (void *)NineDevice9_TestCooperativeLevel,
+ (void *)NineDevice9Ex_TestCooperativeLevel,
(void *)NineDevice9_GetAvailableTextureMem,
(void *)NineDevice9_EvictManagedResources,
(void *)NineDevice9_GetDirect3D,
@@ -265,7 +308,7 @@ IDirect3DDevice9ExVtbl NineDevice9Ex_vtable = {
(void *)NineDevice9_CreateAdditionalSwapChain,
(void *)NineDevice9_GetSwapChain,
(void *)NineDevice9_GetNumberOfSwapChains,
- (void *)NineDevice9_Reset,
+ (void *)NineDevice9Ex_Reset,
(void *)NineDevice9_Present,
(void *)NineDevice9_GetBackBuffer,
(void *)NineDevice9_GetRasterStatus,
@@ -401,13 +444,14 @@ NineDevice9Ex_new( struct pipe_screen *pScreen,
IDirect3D9Ex *pD3D9Ex,
ID3DPresentGroup *pPresentationGroup,
struct d3dadapter9_context *pCTX,
- struct NineDevice9Ex **ppOut )
+ struct NineDevice9Ex **ppOut,
+ int minorVersionNum )
{
BOOL lock;
lock = !!(pCreationParameters->BehaviorFlags & D3DCREATE_MULTITHREADED);
NINE_NEW(Device9Ex, ppOut, lock,
pScreen, pCreationParameters, pCaps, pPresentationParameters,
- pFullscreenDisplayMode, pD3D9Ex, pPresentationGroup, pCTX);
+ pFullscreenDisplayMode, pD3D9Ex, pPresentationGroup, pCTX, minorVersionNum );
}
diff --git a/src/gallium/state_trackers/nine/device9ex.h b/src/gallium/state_trackers/nine/device9ex.h
index 8375622d8a1..1c7e57e0974 100644
--- a/src/gallium/state_trackers/nine/device9ex.h
+++ b/src/gallium/state_trackers/nine/device9ex.h
@@ -44,7 +44,8 @@ NineDevice9Ex_new( struct pipe_screen *pScreen,
IDirect3D9Ex *pD3D9Ex,
ID3DPresentGroup *pPresentationGroup,
struct d3dadapter9_context *pCTX,
- struct NineDevice9Ex **ppOut );
+ struct NineDevice9Ex **ppOut,
+ int minorVersionNum );
HRESULT WINAPI
NineDevice9Ex_SetConvolutionMonoKernel( struct NineDevice9Ex *This,
@@ -72,6 +73,13 @@ NineDevice9Ex_PresentEx( struct NineDevice9Ex *This,
const RGNDATA *pDirtyRegion,
DWORD dwFlags );
+HRESULT WINAPI
+NineDevice9Ex_Present( struct NineDevice9Ex *This,
+ const RECT *pSourceRect,
+ const RECT *pDestRect,
+ HWND hDestWindowOverride,
+ const RGNDATA *pDirtyRegion );
+
HRESULT WINAPI
NineDevice9Ex_GetGPUThreadPriority( struct NineDevice9Ex *This,
INT *pPriority );
@@ -140,10 +148,17 @@ NineDevice9Ex_ResetEx( struct NineDevice9Ex *This,
D3DPRESENT_PARAMETERS *pPresentationParameters,
D3DDISPLAYMODEEX *pFullscreenDisplayMode );
+HRESULT WINAPI
+NineDevice9Ex_Reset( struct NineDevice9Ex *This,
+ D3DPRESENT_PARAMETERS *pPresentationParameters );
+
HRESULT WINAPI
NineDevice9Ex_GetDisplayModeEx( struct NineDevice9Ex *This,
UINT iSwapChain,
D3DDISPLAYMODEEX *pMode,
D3DDISPLAYROTATION *pRotation );
+HRESULT WINAPI
+NineDevice9Ex_TestCooperativeLevel( struct NineDevice9Ex *This );
+
#endif /* _NINE_DEVICE9EX_H_ */
diff --git a/src/gallium/state_trackers/nine/guid.c b/src/gallium/state_trackers/nine/guid.c
index 5034feb4d71..5e63d2f6629 100644
--- a/src/gallium/state_trackers/nine/guid.c
+++ b/src/gallium/state_trackers/nine/guid.c
@@ -20,6 +20,7 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE. */
+#include
#include "guid.h"
const GUID IID_IUnknown = { 0x00000000, 0x0000, 0x0000, { 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46 } };
@@ -64,3 +65,20 @@ GUID_equal( const GUID *a,
}
return TRUE;
}
+
+char* GUID_sprintf(char *guid_str, REFGUID id) {
+ sprintf( guid_str,
+ "{%08X,%04X,%04X,%02X%02X%02X%02X%02X%02X%02X%02X}",
+ id->Data1,
+ id->Data2,
+ id->Data3,
+ id->Data4[0],
+ id->Data4[1],
+ id->Data4[2],
+ id->Data4[3],
+ id->Data4[4],
+ id->Data4[5],
+ id->Data4[6],
+ id->Data4[7]);
+ return guid_str;
+}
diff --git a/src/gallium/state_trackers/nine/guid.h b/src/gallium/state_trackers/nine/guid.h
index 1f9ff009ad8..af8f081bfb5 100644
--- a/src/gallium/state_trackers/nine/guid.h
+++ b/src/gallium/state_trackers/nine/guid.h
@@ -33,4 +33,8 @@ boolean
GUID_equal( const GUID *a,
const GUID *b );
+char*
+GUID_sprintf( char *guid_str,
+ REFGUID id );
+
#endif /* _NINE_GUID_H_ */
diff --git a/src/gallium/state_trackers/nine/indexbuffer9.c b/src/gallium/state_trackers/nine/indexbuffer9.c
index 860313b7f7e..401fe75e95f 100644
--- a/src/gallium/state_trackers/nine/indexbuffer9.c
+++ b/src/gallium/state_trackers/nine/indexbuffer9.c
@@ -40,52 +40,17 @@ NineIndexBuffer9_ctor( struct NineIndexBuffer9 *This,
struct NineUnknownParams *pParams,
D3DINDEXBUFFER_DESC *pDesc )
{
- struct pipe_resource *info = &This->base.info;
HRESULT hr;
DBG("This=%p pParams=%p pDesc=%p Usage=%s\n",
This, pParams, pDesc, nine_D3DUSAGE_to_str(pDesc->Usage));
- This->pipe = pParams->device->pipe;
-
- info->screen = pParams->device->screen;
- info->target = PIPE_BUFFER;
- info->format = PIPE_FORMAT_R8_UNORM;
- info->width0 = pDesc->Size;
- info->flags = 0;
-
- info->bind = PIPE_BIND_INDEX_BUFFER | PIPE_BIND_TRANSFER_WRITE;
- if (!(pDesc->Usage & D3DUSAGE_WRITEONLY))
- info->bind |= PIPE_BIND_TRANSFER_READ;
-
- info->usage = PIPE_USAGE_DEFAULT;
- if (pDesc->Usage & D3DUSAGE_DYNAMIC)
- info->usage = PIPE_USAGE_STREAM;
- if (pDesc->Pool == D3DPOOL_SYSTEMMEM)
- info->usage = PIPE_USAGE_STAGING;
-
- /* if (pDesc->Usage & D3DUSAGE_DONOTCLIP) { } */
- /* if (pDesc->Usage & D3DUSAGE_NONSECURE) { } */
- /* if (pDesc->Usage & D3DUSAGE_NPATCHES) { } */
- /* if (pDesc->Usage & D3DUSAGE_POINTS) { } */
- /* if (pDesc->Usage & D3DUSAGE_RTPATCHES) { } */
- if (pDesc->Usage & D3DUSAGE_SOFTWAREPROCESSING)
- DBG("Application asked for Software Vertex Processing, "
- "but this is unimplemented\n");
-
- info->height0 = 1;
- info->depth0 = 1;
- info->array_size = 1;
- info->last_level = 0;
- info->nr_samples = 0;
-
- hr = NineResource9_ctor(&This->base, pParams, NULL, TRUE, D3DRTYPE_INDEXBUFFER,
- pDesc->Pool, pDesc->Usage);
+ hr = NineBuffer9_ctor(&This->base, pParams, D3DRTYPE_INDEXBUFFER,
+ pDesc->Usage, pDesc->Size, pDesc->Pool);
if (FAILED(hr))
return hr;
- This->buffer.buffer = This->base.resource;
+ This->buffer.buffer = NineIndexBuffer9_GetResource(This);
This->buffer.offset = 0;
- This->map_count = 0;
switch (pDesc->Format) {
case D3DFMT_INDEX16: This->buffer.index_size = 2; break;
@@ -105,9 +70,7 @@ NineIndexBuffer9_ctor( struct NineIndexBuffer9 *This,
void
NineIndexBuffer9_dtor( struct NineIndexBuffer9 *This )
{
- if (This->transfer) { NineIndexBuffer9_Unlock(This); }
-
- NineResource9_dtor(&This->base);
+ NineBuffer9_dtor(&This->base);
}
const struct pipe_index_buffer *
@@ -116,6 +79,12 @@ NineIndexBuffer9_GetBuffer( struct NineIndexBuffer9 *This )
return &This->buffer;
}
+struct pipe_resource *
+NineIndexBuffer9_GetResource( struct NineIndexBuffer9 *This )
+{
+ return NineBuffer9_GetResource(&This->base);
+}
+
HRESULT WINAPI
NineIndexBuffer9_Lock( struct NineIndexBuffer9 *This,
UINT OffsetToLock,
@@ -123,59 +92,13 @@ NineIndexBuffer9_Lock( struct NineIndexBuffer9 *This,
void **ppbData,
DWORD Flags )
{
- struct pipe_box box;
- void *data;
- UINT count;
- const unsigned usage = d3dlock_buffer_to_pipe_transfer_usage(Flags);
-
- DBG("This=%p OffsetToLock=%u SizeToLock=%u ppbData=%p Flags=%i "
- "transfer=%p map_count=%u\n", This, OffsetToLock,
- SizeToLock, ppbData, Flags, This->transfer, This->map_count);
-
- count = ++This->map_count;
-
- if (SizeToLock == 0) {
- SizeToLock = This->desc.Size - OffsetToLock;
- user_warn(OffsetToLock != 0);
- }
-
- u_box_1d(OffsetToLock, SizeToLock, &box);
-
- if (unlikely(count != 1)) {
- DBG("Lock has been called on already locked buffer."
- "Unmapping before mapping again.");
- This->pipe->transfer_unmap(This->pipe, This->transfer);
- }
- data = This->pipe->transfer_map(This->pipe, This->base.resource, 0,
- usage, &box, &This->transfer);
- if (!This->transfer) {
- DBG("pipe::transfer_map failed\n"
- " usage = %u\n"
- " box.x = %u\n"
- " box.width = %u\n",
- usage, box.x, box.width);
- }
- *ppbData = data;
- DBG("Returning memory at %p at address %p\n", *ppbData, ppbData);
-
- return D3D_OK;
+ return NineBuffer9_Lock(&This->base, OffsetToLock, SizeToLock, ppbData, Flags);
}
HRESULT WINAPI
NineIndexBuffer9_Unlock( struct NineIndexBuffer9 *This )
{
- DBG("This=%p\n", This);
- if (!This->map_count) {
- DBG("Unmap called without a previous map call.\n");
- return D3D_OK;
- }
- if (--This->map_count) {
- DBG("Ignoring unmap.\n");
- return D3D_OK;
- }
- This->pipe->transfer_unmap(This->pipe, This->transfer);
- This->transfer = NULL;
- return D3D_OK;
+ return NineBuffer9_Unlock(&This->base);
}
HRESULT WINAPI
diff --git a/src/gallium/state_trackers/nine/indexbuffer9.h b/src/gallium/state_trackers/nine/indexbuffer9.h
index f10578f47ba..f3274b71224 100644
--- a/src/gallium/state_trackers/nine/indexbuffer9.h
+++ b/src/gallium/state_trackers/nine/indexbuffer9.h
@@ -24,7 +24,7 @@
#define _NINE_INDEXBUFFER9_H_
#include "resource9.h"
-
+#include "buffer9.h"
#include "pipe/p_state.h"
struct pipe_screen;
@@ -35,13 +35,10 @@ struct NineDevice9;
struct NineIndexBuffer9
{
- struct NineResource9 base;
+ struct NineBuffer9 base;
/* g3d stuff */
- struct pipe_context *pipe;
struct pipe_index_buffer buffer;
- struct pipe_transfer *transfer;
- UINT map_count;
D3DINDEXBUFFER_DESC desc;
};
@@ -69,6 +66,8 @@ NineIndexBuffer9_dtor( struct NineIndexBuffer9 *This );
const struct pipe_index_buffer *
NineIndexBuffer9_GetBuffer( struct NineIndexBuffer9 *This );
+struct pipe_resource *
+NineIndexBuffer9_GetResource( struct NineIndexBuffer9 *This );
/*** Direct3D public ***/
HRESULT WINAPI
diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index 0feaeab7330..a5466a7bdd4 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -58,7 +58,8 @@ struct nine_ff_vs_key
uint32_t color0in_one : 1;
uint32_t color1in_one : 1;
uint32_t fog : 1;
- uint32_t pad1 : 7;
+ uint32_t specular_enable : 1;
+ uint32_t pad1 : 6;
uint32_t tc_dim_input: 16; /* 8 * 2 bits */
uint32_t pad2 : 16;
uint32_t tc_dim_output: 24; /* 8 * 3 bits */
@@ -466,6 +467,10 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 224.0f));
ureg_ARL(ureg, AR, ureg_src(tmp));
}
+
+ ureg_MOV(ureg, r[2], ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
+ ureg_MOV(ureg, r[3], ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
+
for (i = 0; i < key->vertexblend; ++i) {
for (c = 0; c < 4; ++c) {
cWM[c] = ureg_src_register(TGSI_FILE_CONSTANT, (224 + i * 4) * !key->vertexblend_indexed + c);
@@ -473,22 +478,27 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
}
/* multiply by WORLD(index) */
- ureg_MUL(ureg, r[0], _XXXX(vs->aVtx), cWM[0]);
- ureg_MAD(ureg, r[0], _YYYY(vs->aVtx), cWM[1], ureg_src(r[0]));
- ureg_MAD(ureg, r[0], _ZZZZ(vs->aVtx), cWM[2], ureg_src(r[0]));
- ureg_MAD(ureg, r[0], _WWWW(vs->aVtx), cWM[3], ureg_src(r[0]));
+ ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
+ ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
+ ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
+ ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
- /* accumulate weighted position value */
- if (i)
- ureg_MAD(ureg, r[2], ureg_src(r[0]), ureg_scalar(vs->aWgt, i), ureg_src(r[2]));
- else
- ureg_MUL(ureg, r[2], ureg_src(r[0]), ureg_scalar(vs->aWgt, 0));
+ if (i < (key->vertexblend - 1)) {
+ /* accumulate weighted position value */
+ ureg_MAD(ureg, r[2], ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(r[2]));
+ /* subtract weighted position value for last value */
+ ureg_SUB(ureg, r[3], ureg_src(r[3]), ureg_scalar(vs->aWgt, i));
+ }
}
+
+ /* the last weighted position is always 1 - sum_of_previous_weights */
+ ureg_MAD(ureg, r[2], ureg_src(tmp), ureg_scalar(ureg_src(r[3]), key->vertexblend - 1), ureg_src(r[2]));
+
/* multiply by VIEW_PROJ */
- ureg_MUL(ureg, r[0], _X(r[2]), _CONST(8));
- ureg_MAD(ureg, r[0], _Y(r[2]), _CONST(9), ureg_src(r[0]));
- ureg_MAD(ureg, r[0], _Z(r[2]), _CONST(10), ureg_src(r[0]));
- ureg_MAD(ureg, oPos, _W(r[2]), _CONST(11), ureg_src(r[0]));
+ ureg_MUL(ureg, tmp, _X(r[2]), _CONST(8));
+ ureg_MAD(ureg, tmp, _Y(r[2]), _CONST(9), ureg_src(tmp));
+ ureg_MAD(ureg, tmp, _Z(r[2]), _CONST(10), ureg_src(tmp));
+ ureg_MAD(ureg, oPos, _W(r[2]), _CONST(11), ureg_src(tmp));
if (need_rVtx)
vs->aVtx = ureg_src(r[2]);
@@ -515,10 +525,10 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
ureg_MOV(ureg, oPos, ureg_src(tmp));
} else {
/* position = vertex * WORLD_VIEW_PROJ */
- ureg_MUL(ureg, r[0], _XXXX(vs->aVtx), _CONST(0));
- ureg_MAD(ureg, r[0], _YYYY(vs->aVtx), _CONST(1), ureg_src(r[0]));
- ureg_MAD(ureg, r[0], _ZZZZ(vs->aVtx), _CONST(2), ureg_src(r[0]));
- ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(r[0]));
+ ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
+ ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
+ ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
+ ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
}
if (need_rVtx) {
@@ -746,12 +756,10 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
{
/* hitDir = light.position - eyeVtx
* d = length(hitDir)
- * hitDir /= d
*/
ureg_SUB(ureg, rHit, cLPos, ureg_src(rVtx));
ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
ureg_RSQ(ureg, tmp_y, _X(tmp));
- ureg_MUL(ureg, rHit, ureg_src(rHit), _Y(tmp)); /* normalize */
ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
/* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
@@ -765,6 +773,9 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
ureg_ENDIF(ureg);
+ /* normalize hitDir */
+ ureg_normalize3(ureg, rHit, ureg_src(rHit), tmp);
+
/* if (SPOT light) */
ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
ureg_IF(ureg, _X(tmp), &label[l++]);
@@ -799,9 +810,9 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
/* midVec = normalize(hitDir + eyeDir) */
if (key->localviewer) {
ureg_normalize3(ureg, rMid, ureg_src(rVtx), tmp);
- ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
+ ureg_SUB(ureg, rMid, ureg_src(rHit), ureg_src(rMid));
} else {
- ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, 1.0f));
+ ureg_SUB(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, 1.0f));
}
ureg_normalize3(ureg, rMid, ureg_src(rMid), tmp);
ureg_DP3(ureg, ureg_saturate(tmp_y), ureg_src(rNrm), ureg_src(rMid));
@@ -849,7 +860,14 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W ), vs->mtlA, vs->mtlE);
}
- ureg_MAD(ureg, oCol[0], ureg_src(rD), vs->mtlD, ureg_src(tmp));
+
+ if (key->specular_enable) {
+ /* add oCol[1] to oCol[0] */
+ ureg_MAD(ureg, tmp, ureg_src(rD), vs->mtlD, ureg_src(tmp));
+ ureg_MAD(ureg, oCol[0], ureg_src(rS), vs->mtlS, ureg_src(tmp));
+ } else {
+ ureg_MAD(ureg, oCol[0], ureg_src(rD), vs->mtlD, ureg_src(tmp));
+ }
ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
} else
/* COLOR */
@@ -1012,10 +1030,10 @@ ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
break;
case D3DTA_DIFFUSE:
- reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_PERSPECTIVE);
+ reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
break;
case D3DTA_SPECULAR:
- reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
+ reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
break;
case D3DTA_TEMP:
reg = ps->rTmpSrc;
@@ -1222,7 +1240,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
ps.ureg = ureg;
ps.stage.index_pre_mod = -1;
- ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_PERSPECTIVE);
+ ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
/* Declare all TEMPs we might need, serious drivers have a register allocator. */
for (i = 0; i < Elements(ps.r); ++i)
@@ -1241,7 +1259,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
key->ts[s].colorarg1 == D3DTA_SPECULAR ||
key->ts[s].colorarg2 == D3DTA_SPECULAR)
- ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
+ ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
key->ts[s].colorarg1 == D3DTA_TEXTURE ||
@@ -1258,7 +1276,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
key->ts[s].alphaarg2 == D3DTA_SPECULAR)
- ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
+ ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
@@ -1269,7 +1287,7 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
}
}
if (key->specular)
- ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
+ ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
@@ -1500,6 +1518,9 @@ nine_ff_get_vs(struct NineDevice9 *device)
if (key.fog_mode)
key.fog_range = !key.position_t && state->rs[D3DRS_RANGEFOGENABLE];
+ key.localviewer = !!state->rs[D3DRS_LOCALVIEWER];
+ key.specular_enable = !!state->rs[D3DRS_SPECULARENABLE];
+
if (state->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
key.vertexblend_indexed = !!state->rs[D3DRS_INDEXEDVERTEXBLENDENABLE];
@@ -1847,7 +1868,7 @@ nine_ff_update(struct NineDevice9 *device)
DBG("vs=%p ps=%p\n", device->state.vs, device->state.ps);
/* NOTE: the only reference belongs to the hash table */
- if (!device->state.vs) {
+ if (!state->programmable_vs) {
device->ff.vs = nine_ff_get_vs(device);
device->state.changed.group |= NINE_STATE_VS;
}
@@ -1856,7 +1877,7 @@ nine_ff_update(struct NineDevice9 *device)
device->state.changed.group |= NINE_STATE_PS;
}
- if (!device->state.vs) {
+ if (!state->programmable_vs) {
nine_ff_load_vs_transforms(device);
nine_ff_load_tex_matrices(device);
nine_ff_load_lights(device);
diff --git a/src/gallium/state_trackers/nine/nine_limits.h b/src/gallium/state_trackers/nine/nine_limits.h
new file mode 100644
index 00000000000..ef1ed2566ba
--- /dev/null
+++ b/src/gallium/state_trackers/nine/nine_limits.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2015 Axel Davy
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef _NINE_LIMITS_H_
+#define _NINE_LIMITS_H_
+
+#include "assert.h"
+#include "d3d9types.h"
+
+// state can be any value
+#define NINE_STATE_NO_LIMIT 0
+// value is clamped if below min or max
+#define NINE_STATE_CLAMP 1
+// boolean: 0 -> false; any other value -> true
+#define NINE_STATE_BOOL 2
+// a mask is applied on the value
+#define NINE_STATE_MASK 3
+// if outside a range, state value is changed to a default value
+#define NINE_STATE_RANGE_DEF_VAL 4
+
+struct nine_state_behaviour {
+ unsigned state_value_behaviour;
+ union {
+ struct {
+ unsigned min;
+ unsigned max;
+ } clamp;
+ unsigned mask;
+ struct {
+ unsigned min;
+ unsigned max;
+ unsigned default_val;
+ } range_def_val;
+ } u;
+};
+
+#define __NO_LIMIT_RS(o) \
+ [D3DRS_##o] = {NINE_STATE_NO_LIMIT}
+
+#define __CLAMP_RS(o, m, M) \
+ [D3DRS_##o] = {NINE_STATE_CLAMP, {.clamp = {m, M}}}
+
+#define __BOOLEAN_RS(o) \
+ [D3DRS_##o] = {NINE_STATE_BOOL}
+
+#define __MASK_RS(o, m) \
+ [D3DRS_##o] = {NINE_STATE_MASK, {.mask = m}}
+
+#define __RANGE_DEF_VAL_RS(o, m, M, d) \
+ [D3DRS_##o] = {NINE_STATE_RANGE_DEF_VAL, {.range_def_val = {m, M, d}}}
+
+#define __TO_DETERMINE_RS(o, m, M) \
+ [D3DRS_##o] = {NINE_STATE_NO_LIMIT}
+
+static const struct nine_state_behaviour
+render_state_limits_table[D3DRS_BLENDOPALPHA + 1] = {
+ __TO_DETERMINE_RS(ZENABLE, 0, 3),
+ __TO_DETERMINE_RS(FILLMODE, 1, 3),
+ __CLAMP_RS(SHADEMODE, 1, 3),
+ __BOOLEAN_RS(ZWRITEENABLE),
+ __BOOLEAN_RS(ALPHATESTENABLE),
+ __BOOLEAN_RS(LASTPIXEL),
+ __RANGE_DEF_VAL_RS(SRCBLEND, 1, 17, D3DBLEND_ZERO),
+ __RANGE_DEF_VAL_RS(DESTBLEND, 1, 17, D3DBLEND_ZERO),
+ __CLAMP_RS(CULLMODE, 1, 3),
+ __CLAMP_RS(ZFUNC, 1, 8),
+ __MASK_RS(ALPHAREF, 0x000000FF),
+ __CLAMP_RS(ALPHAFUNC, 1, 8),
+ __BOOLEAN_RS(DITHERENABLE),
+ __BOOLEAN_RS(ALPHABLENDENABLE),
+ __BOOLEAN_RS(FOGENABLE),
+ __BOOLEAN_RS(SPECULARENABLE),
+ __NO_LIMIT_RS(FOGCOLOR),
+ __MASK_RS(FOGTABLEMODE, 0x00000007),
+    __NO_LIMIT_RS(FOGSTART), /* a bit more complex than that, let's ignore */
+ __NO_LIMIT_RS(FOGEND),
+ __NO_LIMIT_RS(FOGDENSITY), /* actually should be between 0.0 and 1.0 */
+ __BOOLEAN_RS(RANGEFOGENABLE),
+ __BOOLEAN_RS(STENCILENABLE),
+ __CLAMP_RS(STENCILFAIL, 1, 8),
+ __CLAMP_RS(STENCILZFAIL, 1, 8),
+ __CLAMP_RS(STENCILPASS, 1, 8),
+ __CLAMP_RS(STENCILFUNC, 1, 8),
+ __NO_LIMIT_RS(STENCILREF),
+ __NO_LIMIT_RS(STENCILMASK),
+ __NO_LIMIT_RS(STENCILWRITEMASK),
+ __NO_LIMIT_RS(TEXTUREFACTOR),
+ __TO_DETERMINE_RS(WRAP0, 0, 15),
+ __TO_DETERMINE_RS(WRAP1, 0, 15),
+ __TO_DETERMINE_RS(WRAP2, 0, 15),
+ __TO_DETERMINE_RS(WRAP3, 0, 15),
+ __TO_DETERMINE_RS(WRAP4, 0, 15),
+ __TO_DETERMINE_RS(WRAP5, 0, 15),
+ __TO_DETERMINE_RS(WRAP6, 0, 15),
+ __TO_DETERMINE_RS(WRAP7, 0, 15),
+ __BOOLEAN_RS(CLIPPING),
+ __BOOLEAN_RS(LIGHTING),
+ __NO_LIMIT_RS(AMBIENT),
+ __MASK_RS(FOGVERTEXMODE, 0x00000007),
+ __BOOLEAN_RS(COLORVERTEX),
+ __BOOLEAN_RS(LOCALVIEWER),
+ __BOOLEAN_RS(NORMALIZENORMALS),
+ __TO_DETERMINE_RS(DIFFUSEMATERIALSOURCE, 0, 2),
+ __TO_DETERMINE_RS(SPECULARMATERIALSOURCE, 0, 2),
+ __TO_DETERMINE_RS(AMBIENTMATERIALSOURCE, 0, 2),
+ __TO_DETERMINE_RS(EMISSIVEMATERIALSOURCE, 0, 2),
+ __TO_DETERMINE_RS(VERTEXBLEND, 0, 256), /* values between 4 and 254 -both included- are forbidden too */
+ __NO_LIMIT_RS(CLIPPLANEENABLE), /* expected check seems complex */
+ __TO_DETERMINE_RS(POINTSIZE, 0, 0xFFFFFFFF),
+ __TO_DETERMINE_RS(POINTSIZE_MIN, 0, 0x7FFFFFFF), /* float >= 0.0 */
+ __BOOLEAN_RS(POINTSPRITEENABLE),
+ __BOOLEAN_RS(POINTSCALEENABLE),
+ __TO_DETERMINE_RS(POINTSCALE_A, 0, 0x7FFFFFFF), /* float >= 0.0 */
+ __TO_DETERMINE_RS(POINTSCALE_B, 0, 0x7FFFFFFF), /* float >= 0.0 */
+ __TO_DETERMINE_RS(POINTSCALE_C, 0, 0x7FFFFFFF), /* float >= 0.0 */
+ __BOOLEAN_RS(MULTISAMPLEANTIALIAS),
+ __NO_LIMIT_RS(MULTISAMPLEMASK),
+ __TO_DETERMINE_RS(PATCHEDGESTYLE, 0, 1),
+ __TO_DETERMINE_RS(DEBUGMONITORTOKEN, 0, 1),
+ __TO_DETERMINE_RS(POINTSIZE_MAX, 0, 0x7FFFFFFF), /* check more complex than that */
+ __BOOLEAN_RS(INDEXEDVERTEXBLENDENABLE),
+ __TO_DETERMINE_RS(COLORWRITEENABLE, 0, 15),
+ __NO_LIMIT_RS(TWEENFACTOR),
+ __CLAMP_RS(BLENDOP, 1, 5),
+ __TO_DETERMINE_RS(POSITIONDEGREE, 1, 5), /* can actually be only 1 or 5 */
+ __TO_DETERMINE_RS(NORMALDEGREE, 1, 2),
+ __BOOLEAN_RS(SCISSORTESTENABLE),
+ __NO_LIMIT_RS(SLOPESCALEDEPTHBIAS),
+ __BOOLEAN_RS(ANTIALIASEDLINEENABLE),
+ __NO_LIMIT_RS(MINTESSELLATIONLEVEL),
+ __NO_LIMIT_RS(MAXTESSELLATIONLEVEL),
+ __NO_LIMIT_RS(ADAPTIVETESS_X),
+ __NO_LIMIT_RS(ADAPTIVETESS_Y),
+ __NO_LIMIT_RS(ADAPTIVETESS_Z),
+ __NO_LIMIT_RS(ADAPTIVETESS_W),
+ __BOOLEAN_RS(ENABLEADAPTIVETESSELLATION),
+ __BOOLEAN_RS(TWOSIDEDSTENCILMODE),
+ __CLAMP_RS(CCW_STENCILFAIL, 1, 8),
+ __CLAMP_RS(CCW_STENCILZFAIL, 1, 8),
+ __CLAMP_RS(CCW_STENCILPASS, 1, 8),
+ __CLAMP_RS(CCW_STENCILFUNC, 1, 8),
+ __TO_DETERMINE_RS(COLORWRITEENABLE1, 0, 15),
+ __TO_DETERMINE_RS(COLORWRITEENABLE2, 0, 15),
+ __TO_DETERMINE_RS(COLORWRITEENABLE3, 0, 15),
+ __NO_LIMIT_RS(BLENDFACTOR),
+ __BOOLEAN_RS(SRGBWRITEENABLE),
+ __NO_LIMIT_RS(DEPTHBIAS),
+ __TO_DETERMINE_RS(WRAP8, 0, 15),
+ __TO_DETERMINE_RS(WRAP9, 0, 15),
+ __TO_DETERMINE_RS(WRAP10, 0, 15),
+ __TO_DETERMINE_RS(WRAP11, 0, 15),
+ __TO_DETERMINE_RS(WRAP12, 0, 15),
+ __TO_DETERMINE_RS(WRAP13, 0, 15),
+ __TO_DETERMINE_RS(WRAP14, 0, 15),
+ __TO_DETERMINE_RS(WRAP15, 0, 15),
+ __BOOLEAN_RS(SEPARATEALPHABLENDENABLE),
+ __RANGE_DEF_VAL_RS(SRCBLENDALPHA, 1, 17, D3DBLEND_ZERO),
+ __RANGE_DEF_VAL_RS(DESTBLENDALPHA, 1, 17, D3DBLEND_ZERO),
+ __CLAMP_RS(BLENDOPALPHA, 1, 5)
+};
+
+static DWORD inline
+nine_fix_render_state_value(D3DRENDERSTATETYPE State,
+ DWORD Value)
+{
+ struct nine_state_behaviour behaviour = render_state_limits_table[State];
+
+ switch (behaviour.state_value_behaviour) {
+ case NINE_STATE_NO_LIMIT:
+ break;
+ case NINE_STATE_CLAMP:
+ if (Value < behaviour.u.clamp.min)
+ Value = behaviour.u.clamp.min;
+ else if (Value > behaviour.u.clamp.max)
+ Value = behaviour.u.clamp.max;
+ break;
+ case NINE_STATE_BOOL:
+ Value = Value ? 1 : 0;
+ break;
+ case NINE_STATE_MASK:
+ Value = Value & behaviour.u.mask;
+ break;
+ case NINE_STATE_RANGE_DEF_VAL:
+ if (Value < behaviour.u.range_def_val.min || Value > behaviour.u.range_def_val.max)
+ Value = behaviour.u.range_def_val.default_val;
+ break;
+ }
+
+ return Value;
+}
+
+#endif /* _NINE_LIMITS_H_ */
diff --git a/src/gallium/state_trackers/nine/nine_pdata.h b/src/gallium/state_trackers/nine/nine_pdata.h
index 7bdd702cfbb..0e9a2aa7160 100644
--- a/src/gallium/state_trackers/nine/nine_pdata.h
+++ b/src/gallium/state_trackers/nine/nine_pdata.h
@@ -5,6 +5,7 @@
struct pheader
{
boolean unknown;
+ GUID guid;
DWORD size;
char data[1];
};
diff --git a/src/gallium/state_trackers/nine/nine_pipe.c b/src/gallium/state_trackers/nine/nine_pipe.c
index 2be30f7e097..27a10d64473 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.c
+++ b/src/gallium/state_trackers/nine/nine_pipe.c
@@ -181,6 +181,7 @@ nine_convert_blend_state(struct pipe_blend_state *blend_state, const DWORD *rs)
}
nine_convert_blend_state_fixup(&blend, rs); /* for BOTH[INV]SRCALPHA */
}
+
blend.rt[0].colormask = rs[D3DRS_COLORWRITEENABLE];
if (rs[D3DRS_COLORWRITEENABLE1] != rs[D3DRS_COLORWRITEENABLE] ||
@@ -222,8 +223,8 @@ nine_convert_sampler_state(struct cso_context *ctx, int idx, const DWORD *ss)
samp.wrap_s = d3dtextureaddress_to_pipe_tex_wrap(ss[D3DSAMP_ADDRESSU]);
samp.wrap_t = d3dtextureaddress_to_pipe_tex_wrap(ss[D3DSAMP_ADDRESSV]);
samp.wrap_r = d3dtextureaddress_to_pipe_tex_wrap(ss[D3DSAMP_ADDRESSW]);
- samp.min_img_filter = ss[D3DSAMP_MINFILTER] == D3DTEXF_POINT ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
- samp.mag_img_filter = ss[D3DSAMP_MAGFILTER] == D3DTEXF_POINT ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
+ samp.min_img_filter = (ss[D3DSAMP_MINFILTER] == D3DTEXF_POINT && !ss[NINED3DSAMP_SHADOW]) ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
+ samp.mag_img_filter = (ss[D3DSAMP_MAGFILTER] == D3DTEXF_POINT && !ss[NINED3DSAMP_SHADOW]) ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
if (ss[D3DSAMP_MINFILTER] == D3DTEXF_ANISOTROPIC ||
ss[D3DSAMP_MAGFILTER] == D3DTEXF_ANISOTROPIC)
samp.max_anisotropy = ss[D3DSAMP_MAXANISOTROPY];
@@ -265,7 +266,7 @@ nine_pipe_context_clear(struct NineDevice9 *This)
const enum pipe_format nine_d3d9_to_pipe_format_map[120] =
{
[D3DFMT_UNKNOWN] = PIPE_FORMAT_NONE,
- [D3DFMT_R8G8B8] = PIPE_FORMAT_NONE,
+ [D3DFMT_R8G8B8] = PIPE_FORMAT_R8G8B8_UNORM,
[D3DFMT_A8R8G8B8] = PIPE_FORMAT_B8G8R8A8_UNORM,
[D3DFMT_X8R8G8B8] = PIPE_FORMAT_B8G8R8X8_UNORM,
[D3DFMT_R5G6B5] = PIPE_FORMAT_B5G6R5_UNORM,
@@ -323,8 +324,8 @@ const enum pipe_format nine_d3d9_to_pipe_format_map[120] =
const D3DFORMAT nine_pipe_to_d3d9_format_map[PIPE_FORMAT_COUNT] =
{
[PIPE_FORMAT_NONE] = D3DFMT_UNKNOWN,
-
-/* [PIPE_FORMAT_B8G8R8_UNORM] = D3DFMT_R8G8B8, */
+ /* TODO: rename PIPE_FORMAT_R8G8B8_UNORM to PIPE_FORMAT_B8G8R8_UNORM */
+ [PIPE_FORMAT_R8G8B8_UNORM] = D3DFMT_R8G8B8,
[PIPE_FORMAT_B8G8R8A8_UNORM] = D3DFMT_A8R8G8B8,
[PIPE_FORMAT_B8G8R8X8_UNORM] = D3DFMT_X8R8G8B8,
[PIPE_FORMAT_B5G6R5_UNORM] = D3DFMT_R5G6B5,
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index ed431738abc..a7a7da27903 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -852,7 +852,12 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
/* the address register (vs only) must be
* assigned before use */
assert(!ureg_dst_is_undef(tx->regs.a0));
- ureg_ARR(ureg, tx->regs.address, ureg_src(tx->regs.a0));
+ /* Round to lowest for vs1.1 (contrary to the doc), else
+ * round to nearest */
+ if (tx->version.major < 2 && tx->version.minor < 2)
+ ureg_ARL(ureg, tx->regs.address, ureg_src(tx->regs.a0));
+ else
+ ureg_ARR(ureg, tx->regs.address, ureg_src(tx->regs.a0));
src = ureg_src(tx->regs.address);
} else {
if (tx->version.major < 2 && tx->version.minor < 4) {
@@ -870,9 +875,12 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
} else {
if (tx->version.major < 3) {
assert(!param->rel);
- src = ureg_DECL_fs_input(tx->ureg, TGSI_SEMANTIC_COLOR,
- param->idx,
- TGSI_INTERPOLATE_PERSPECTIVE);
+ src = ureg_DECL_fs_input_cyl_centroid(
+ ureg, TGSI_SEMANTIC_COLOR, param->idx,
+ TGSI_INTERPOLATE_COLOR, 0,
+ tx->info->force_color_in_centroid ?
+ TGSI_INTERPOLATE_LOC_CENTROID : 0,
+ 0, 1);
} else {
assert(!param->rel); /* TODO */
assert(param->idx < Elements(tx->regs.v));
@@ -1163,12 +1171,9 @@ _tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param)
assert(!param->rel);
tx->info->rt_mask |= 1 << param->idx;
if (ureg_dst_is_undef(tx->regs.oCol[param->idx])) {
- /* ps < 3: oCol[0] will have fog blending afterward
- * vs < 3: oD1.w (D3DPMISCCAPS_FOGANDSPECULARALPHA) set to 0 even if set */
+ /* ps < 3: oCol[0] will have fog blending afterward */
if (!IS_VS && tx->version.major < 3 && param->idx == 0) {
tx->regs.oCol[0] = ureg_DECL_temporary(tx->ureg);
- } else if (IS_VS && tx->version.major < 3 && param->idx == 1) {
- tx->regs.oCol[1] = ureg_DECL_temporary(tx->ureg);
} else {
tx->regs.oCol[param->idx] =
ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, param->idx);
@@ -1543,25 +1548,6 @@ DECL_SPECIAL(CALLNZ)
return D3D_OK;
}
-DECL_SPECIAL(MOV_vs1x)
-{
- if (tx->insn.dst[0].file == D3DSPR_ADDR) {
- /* Implementation note: We don't write directly
- * to the addr register, but to an intermediate
- * float register.
- * Contrary to the doc, when writing to ADDR here,
- * the rounding is not to nearest, but to lowest
- * (wine test).
- * Since we use ARR next, substract 0.5. */
- ureg_SUB(tx->ureg,
- tx_dst_param(tx, &tx->insn.dst[0]),
- tx_src_param(tx, &tx->insn.src[0]),
- ureg_imm1f(tx->ureg, 0.5f));
- return D3D_OK;
- }
- return NineTranslateInstruction_Generic(tx);
-}
-
DECL_SPECIAL(LOOP)
{
struct ureg_program *ureg = tx->ureg;
@@ -1978,6 +1964,7 @@ nine_tgsi_to_interp_mode(struct tgsi_declaration_semantic *sem)
return TGSI_INTERPOLATE_LINEAR;
case TGSI_SEMANTIC_BCOLOR:
case TGSI_SEMANTIC_COLOR:
+ return TGSI_INTERPOLATE_COLOR;
case TGSI_SEMANTIC_FOG:
case TGSI_SEMANTIC_GENERIC:
case TGSI_SEMANTIC_TEXCOORD:
@@ -2058,13 +2045,17 @@ DECL_SPECIAL(DCL)
}
} else {
if (is_input && tx->version.major >= 3) {
+ unsigned interp_location = 0;
/* SM3 only, SM2 input semantic determined by file */
assert(sem.reg.idx < Elements(tx->regs.v));
+ if (sem.reg.mod & NINED3DSPDM_CENTROID ||
+ (tgsi.Name == TGSI_SEMANTIC_COLOR && tx->info->force_color_in_centroid))
+ interp_location = TGSI_INTERPOLATE_LOC_CENTROID;
tx->regs.v[sem.reg.idx] = ureg_DECL_fs_input_cyl_centroid(
ureg, tgsi.Name, tgsi.Index,
nine_tgsi_to_interp_mode(&tgsi),
0, /* cylwrap */
- sem.reg.mod & NINED3DSPDM_CENTROID, 0, 1);
+ interp_location, 0, 1);
} else
if (!is_input && 0) { /* declare in COLOROUT/DEPTHOUT case */
/* FragColor or FragDepth */
@@ -2736,8 +2727,7 @@ DECL_SPECIAL(COMMENT)
struct sm1_op_info inst_table[] =
{
_OPI(NOP, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, NULL), /* 0 */
- _OPI(MOV, MOV, V(0,0), V(1,1), V(0,0), V(0,0), 1, 1, SPECIAL(MOV_vs1x)),
- _OPI(MOV, MOV, V(2,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL),
+ _OPI(MOV, MOV, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL),
_OPI(ADD, ADD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 2 */
_OPI(SUB, SUB, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 3 */
_OPI(MAD, MAD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 3, NULL), /* 4 */
@@ -3426,13 +3416,6 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
ureg_MOV(tx->ureg, ureg_writemask(tx->regs.oFog, TGSI_WRITEMASK_X), ureg_imm1f(tx->ureg, 0.0f));
}
- /* vs < 3: oD1.w (D3DPMISCCAPS_FOGANDSPECULARALPHA) set to 0 even if set */
- if (IS_VS && tx->version.major < 3 && !ureg_dst_is_undef(tx->regs.oCol[1])) {
- struct ureg_dst dst = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, 1);
- ureg_MOV(tx->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oCol[1]));
- ureg_MOV(tx->ureg, ureg_writemask(dst, TGSI_WRITEMASK_W), ureg_imm1f(tx->ureg, 0.0f));
- }
-
if (info->position_t)
ureg_property(tx->ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
diff --git a/src/gallium/state_trackers/nine/nine_shader.h b/src/gallium/state_trackers/nine/nine_shader.h
index 41577ac572b..1fe0c4bd182 100644
--- a/src/gallium/state_trackers/nine/nine_shader.h
+++ b/src/gallium/state_trackers/nine/nine_shader.h
@@ -61,6 +61,7 @@ struct nine_shader_info
uint8_t fog_enable;
uint8_t fog_mode;
+ uint8_t force_color_in_centroid;
uint16_t projected; /* ps 1.1 to 1.3 */
unsigned const_i_base; /* in vec4 (16 byte) units */
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index aee31622088..6f94e378984 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -367,14 +367,14 @@ prepare_vs(struct NineDevice9 *device, uint8_t shader_changed)
uint32_t changed_group = 0;
int has_key_changed = 0;
- if (likely(vs))
+ if (likely(state->programmable_vs))
has_key_changed = NineVertexShader9_UpdateKey(vs, state);
if (!shader_changed && !has_key_changed)
return 0;
/* likely because we dislike FF */
- if (likely(vs)) {
+ if (likely(state->programmable_vs)) {
state->cso.vs = NineVertexShader9_GetVariant(vs);
} else {
vs = device->ff.vs;
@@ -427,8 +427,8 @@ prepare_ps(struct NineDevice9 *device, uint8_t shader_changed)
/* State preparation + State commit */
-static uint32_t
-update_framebuffer(struct NineDevice9 *device)
+static void
+update_framebuffer(struct NineDevice9 *device, bool is_clear)
{
struct pipe_context *pipe = device->pipe;
struct nine_state *state = &device->state;
@@ -438,7 +438,8 @@ update_framebuffer(struct NineDevice9 *device)
unsigned w = rt0->desc.Width;
unsigned h = rt0->desc.Height;
D3DMULTISAMPLE_TYPE nr_samples = rt0->desc.MultiSampleType;
- unsigned mask = state->ps ? state->ps->rt_mask : 1;
+ unsigned ps_mask = state->ps ? state->ps->rt_mask : 1;
+ unsigned mask = is_clear ? 0xf : ps_mask;
const int sRGB = state->rs[D3DRS_SRGBWRITEENABLE] ? 1 : 0;
DBG("\n");
@@ -498,13 +499,13 @@ update_framebuffer(struct NineDevice9 *device)
pipe->set_framebuffer_state(pipe, fb); /* XXX: cso ? */
- return state->changed.group;
+ if (is_clear && state->rt_mask == ps_mask)
+ state->changed.group &= ~NINE_STATE_FB;
}
static void
update_viewport(struct NineDevice9 *device)
{
- struct pipe_context *pipe = device->pipe;
const D3DVIEWPORT9 *vport = &device->state.viewport;
struct pipe_viewport_state pvport;
@@ -543,7 +544,7 @@ update_viewport(struct NineDevice9 *device)
pvport.translate[1] -= 1.0f / 128.0f;
}
- pipe->set_viewport_states(pipe, 0, 1, &pvport);
+ cso_set_viewport(device->cso, &pvport);
}
/* Loop through VS inputs and pick the vertex elements with the declared
@@ -567,7 +568,7 @@ update_vertex_elements(struct NineDevice9 *device)
state->stream_usage_mask = 0;
memset(vdecl_index_map, -1, 16);
memset(used_streams, 0, device->caps.MaxStreams);
- vs = device->state.vs ? device->state.vs : device->ff.vs;
+ vs = state->programmable_vs ? device->state.vs : device->ff.vs;
if (vdecl) {
for (n = 0; n < vs->num_inputs; ++n) {
@@ -761,7 +762,7 @@ update_textures_and_samplers(struct NineDevice9 *device)
cso_single_sampler_done(device->cso, PIPE_SHADER_FRAGMENT);
commit_samplers = FALSE;
- sampler_mask = state->vs ? state->vs->sampler_mask : 0;
+ sampler_mask = state->programmable_vs ? state->vs->sampler_mask : 0;
state->bound_samplers_mask_vs = 0;
for (num_textures = 0, i = 0; i < NINE_MAX_SAMPLERS_VS; ++i) {
const unsigned s = NINE_SAMPLER_VS(i);
@@ -854,7 +855,7 @@ commit_vs_constants(struct NineDevice9 *device)
{
struct pipe_context *pipe = device->pipe;
- if (unlikely(!device->state.vs))
+ if (unlikely(!device->state.programmable_vs))
pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb_vs_ff);
else
pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb_vs);
@@ -913,7 +914,8 @@ commit_ps(struct NineDevice9 *device)
NINE_STATE_DSA | \
NINE_STATE_VIEWPORT | \
NINE_STATE_VDECL | \
- NINE_STATE_IDXBUF)
+ NINE_STATE_IDXBUF | \
+ NINE_STATE_STREAMFREQ)
#define NINE_STATE_RARE \
(NINE_STATE_SCISSOR | \
@@ -934,16 +936,14 @@ validate_textures(struct NineDevice9 *device)
}
void
-nine_update_state_framebuffer(struct NineDevice9 *device)
+nine_update_state_framebuffer_clear(struct NineDevice9 *device)
{
struct nine_state *state = &device->state;
validate_textures(device);
if (state->changed.group & NINE_STATE_FB)
- update_framebuffer(device);
-
- state->changed.group &= ~NINE_STATE_FB;
+ update_framebuffer(device, TRUE);
}
boolean
@@ -964,7 +964,7 @@ nine_update_state(struct NineDevice9 *device)
validate_textures(device); /* may clobber state */
/* ff_update may change VS/PS dirty bits */
- if (unlikely(!state->vs || !state->ps))
+ if (unlikely(!state->programmable_vs || !state->ps))
nine_ff_update(device);
group = state->changed.group;
@@ -977,15 +977,14 @@ nine_update_state(struct NineDevice9 *device)
if (group & (NINE_STATE_COMMON | NINE_STATE_VS)) {
if (group & NINE_STATE_FB)
- group |= update_framebuffer(device); /* may set NINE_STATE_RASTERIZER */
+ update_framebuffer(device, FALSE);
if (group & NINE_STATE_BLEND)
prepare_blend(device);
if (group & NINE_STATE_DSA)
prepare_dsa(device);
if (group & NINE_STATE_VIEWPORT)
update_viewport(device);
- if ((group & (NINE_STATE_VDECL | NINE_STATE_VS)) ||
- state->changed.stream_freq & ~1)
+ if (group & (NINE_STATE_VDECL | NINE_STATE_VS | NINE_STATE_STREAMFREQ))
update_vertex_elements(device);
if (group & NINE_STATE_IDXBUF)
commit_index_buffer(device);
@@ -997,12 +996,12 @@ nine_update_state(struct NineDevice9 *device)
if (group & (NINE_STATE_TEXTURE | NINE_STATE_SAMPLER))
update_textures_and_samplers(device);
if (device->prefer_user_constbuf) {
- if ((group & (NINE_STATE_VS_CONST | NINE_STATE_VS)) && state->vs)
+ if ((group & (NINE_STATE_VS_CONST | NINE_STATE_VS)) && state->programmable_vs)
prepare_vs_constants_userbuf(device);
if ((group & (NINE_STATE_PS_CONST | NINE_STATE_PS)) && state->ps)
prepare_ps_constants_userbuf(device);
} else {
- if ((group & NINE_STATE_VS_CONST) && state->vs)
+ if ((group & NINE_STATE_VS_CONST) && state->programmable_vs)
upload_constants(device, PIPE_SHADER_VERTEX);
if ((group & NINE_STATE_PS_CONST) && state->ps)
upload_constants(device, PIPE_SHADER_FRAGMENT);
@@ -1262,6 +1261,8 @@ nine_state_set_defaults(struct NineDevice9 *device, const D3DCAPS9 *caps,
*/
state->rs[D3DRS_POINTSIZE_MAX] = fui(caps->MaxPointSize);
+ memcpy(state->rs_advertised, state->rs, sizeof(state->rs));
+
/* Set changed flags to initialize driver.
*/
state->changed.group = NINE_STATE_ALL;
@@ -1314,8 +1315,10 @@ nine_state_clear(struct nine_state *state, const boolean device)
nine_bind(&state->vs, NULL);
nine_bind(&state->ps, NULL);
nine_bind(&state->vdecl, NULL);
- for (i = 0; i < PIPE_MAX_ATTRIBS; ++i)
+ for (i = 0; i < PIPE_MAX_ATTRIBS; ++i) {
nine_bind(&state->stream[i], NULL);
+ pipe_resource_reference(&state->vtxbuf[i].buffer, NULL);
+ }
nine_bind(&state->idxbuf, NULL);
for (i = 0; i < NINE_MAX_SAMPLERS; ++i) {
if (device &&
diff --git a/src/gallium/state_trackers/nine/nine_state.h b/src/gallium/state_trackers/nine/nine_state.h
index b34da70ef48..a4ec4e3b63a 100644
--- a/src/gallium/state_trackers/nine/nine_state.h
+++ b/src/gallium/state_trackers/nine/nine_state.h
@@ -61,23 +61,24 @@
#define NINE_STATE_SAMPLER (1 << 11)
#define NINE_STATE_VDECL (1 << 12)
#define NINE_STATE_IDXBUF (1 << 13)
-#define NINE_STATE_PRIM (1 << 14)
-#define NINE_STATE_MATERIAL (1 << 15)
-#define NINE_STATE_BLEND_COLOR (1 << 16)
-#define NINE_STATE_STENCIL_REF (1 << 17)
-#define NINE_STATE_SAMPLE_MASK (1 << 18)
-#define NINE_STATE_FF (0x1f << 19)
-#define NINE_STATE_FF_VS (0x17 << 19)
-#define NINE_STATE_FF_PS (0x18 << 19)
-#define NINE_STATE_FF_LIGHTING (1 << 19)
-#define NINE_STATE_FF_MATERIAL (1 << 20)
-#define NINE_STATE_FF_VSTRANSF (1 << 21)
-#define NINE_STATE_FF_PSSTAGES (1 << 22)
-#define NINE_STATE_FF_OTHER (1 << 23)
-#define NINE_STATE_FOG_SHADER (1 << 24)
-#define NINE_STATE_PS1X_SHADER (1 << 25)
-#define NINE_STATE_ALL 0x3ffffff
-#define NINE_STATE_UNHANDLED (1 << 26)
+#define NINE_STATE_STREAMFREQ (1 << 14)
+#define NINE_STATE_PRIM (1 << 15)
+#define NINE_STATE_MATERIAL (1 << 16)
+#define NINE_STATE_BLEND_COLOR (1 << 17)
+#define NINE_STATE_STENCIL_REF (1 << 18)
+#define NINE_STATE_SAMPLE_MASK (1 << 19)
+#define NINE_STATE_FF (0x1f << 20)
+#define NINE_STATE_FF_VS (0x17 << 20)
+#define NINE_STATE_FF_PS (0x18 << 20)
+#define NINE_STATE_FF_LIGHTING (1 << 20)
+#define NINE_STATE_FF_MATERIAL (1 << 21)
+#define NINE_STATE_FF_VSTRANSF (1 << 22)
+#define NINE_STATE_FF_PSSTAGES (1 << 23)
+#define NINE_STATE_FF_OTHER (1 << 24)
+#define NINE_STATE_FOG_SHADER (1 << 25)
+#define NINE_STATE_PS1X_SHADER (1 << 26)
+#define NINE_STATE_ALL 0x7ffffff
+#define NINE_STATE_UNHANDLED (1 << 27)
#define NINE_STATE_COMMIT_DSA (1 << 0)
#define NINE_STATE_COMMIT_RASTERIZER (1 << 1)
@@ -152,6 +153,7 @@ struct nine_state
int vs_const_i[NINE_MAX_CONST_I][4];
BOOL vs_const_b[NINE_MAX_CONST_B];
float *vs_lconstf_temp;
+ BOOL programmable_vs;
struct NinePixelShader9 *ps;
float *ps_const_f;
@@ -179,6 +181,7 @@ struct nine_state
uint8_t rt_mask;
DWORD rs[NINED3DRS_COUNT];
+ DWORD rs_advertised[NINED3DRS_COUNT]; /* the ones apps get with GetRenderState */
struct NineBaseTexture9 *texture[NINE_MAX_SAMPLERS]; /* PS, DMAP, VS */
@@ -236,7 +239,7 @@ extern const uint32_t nine_render_states_vertex[(NINED3DRS_COUNT + 31) / 32];
struct NineDevice9;
-void nine_update_state_framebuffer(struct NineDevice9 *);
+void nine_update_state_framebuffer_clear(struct NineDevice9 *);
boolean nine_update_state(struct NineDevice9 *);
void nine_state_restore_non_cso(struct NineDevice9 *device);
diff --git a/src/gallium/state_trackers/nine/pixelshader9.c b/src/gallium/state_trackers/nine/pixelshader9.c
index 42bc349c2cc..00be67f8955 100644
--- a/src/gallium/state_trackers/nine/pixelshader9.c
+++ b/src/gallium/state_trackers/nine/pixelshader9.c
@@ -160,6 +160,7 @@ NinePixelShader9_GetVariant( struct NinePixelShader9 *This )
info.sampler_ps1xtypes = key;
info.fog_enable = device->state.rs[D3DRS_FOGENABLE];
info.fog_mode = device->state.rs[D3DRS_FOGTABLEMODE];
+ info.force_color_in_centroid = key >> 34 & 1;
info.projected = (key >> 48) & 0xffff;
hr = nine_translate_shader(This->base.device, &info);
diff --git a/src/gallium/state_trackers/nine/pixelshader9.h b/src/gallium/state_trackers/nine/pixelshader9.h
index e09009f6621..6b431813a81 100644
--- a/src/gallium/state_trackers/nine/pixelshader9.h
+++ b/src/gallium/state_trackers/nine/pixelshader9.h
@@ -28,6 +28,7 @@
#include "nine_state.h"
#include "basetexture9.h"
#include "nine_ff.h"
+#include "surface9.h"
struct nine_lconstf;
@@ -92,6 +93,10 @@ NinePixelShader9_UpdateKey( struct NinePixelShader9 *ps,
key |= ((uint64_t)state->rs[D3DRS_FOGTABLEMODE]) << 33;
}
+ /* centroid interpolation automatically used for color ps inputs */
+ if (state->rt[0]->desc.MultiSampleType > 1)
+ key |= ((uint64_t)1) << 34;
+
if (unlikely(ps->byte_code.version < 0x14)) {
projected = nine_ff_get_projected_key(state);
key |= ((uint64_t) projected) << 48;
diff --git a/src/gallium/state_trackers/nine/resource9.c b/src/gallium/state_trackers/nine/resource9.c
index 6d915338b24..b929c50a83c 100644
--- a/src/gallium/state_trackers/nine/resource9.c
+++ b/src/gallium/state_trackers/nine/resource9.c
@@ -29,12 +29,12 @@
#include "util/u_hash_table.h"
#include "util/u_inlines.h"
+#include "util/u_resource.h"
#include "nine_pdata.h"
#define DBG_CHANNEL DBG_RESOURCE
-
HRESULT
NineResource9_ctor( struct NineResource9 *This,
struct NineUnknownParams *pParams,
@@ -62,6 +62,33 @@ NineResource9_ctor( struct NineResource9 *This,
if (Allocate) {
assert(!initResource);
+
+ /* On Windows it is possible that allocation fails when
+ * IDirect3DDevice9::GetAvailableTextureMem() still reports
+ * enough free space.
+ *
+ * Some games allocate surfaces
+ * in a loop until they receive D3DERR_OUTOFVIDEOMEMORY to measure
+ * the available texture memory size.
+ *
+ * We are not using the driver's VRAM statistics because:
+ * * This would add overhead to each resource allocation.
+ * Freeing memory is lazy and takes some time, but applications
+ * expect the memory counter to change immediately after allocating
+ * or freeing memory.
+ *
+ * Vertex buffers and index buffers are not accounted for!
+ */
+ if (This->info.target != PIPE_BUFFER) {
+ This->size = util_resource_size(&This->info);
+
+ This->base.device->available_texture_mem -= This->size;
+ if (This->base.device->available_texture_mem <=
+ This->base.device->available_texture_limit) {
+ return D3DERR_OUTOFVIDEOMEMORY;
+ }
+ }
+
DBG("(%p) Creating pipe_resource.\n", This);
This->resource = screen->resource_create(screen, &This->info);
if (!This->resource)
@@ -92,6 +119,10 @@ NineResource9_dtor( struct NineResource9 *This )
* still hold a reference. */
pipe_resource_reference(&This->resource, NULL);
+ /* NOTE: size is 0, unless something has actually been allocated */
+ if (This->base.device)
+ This->base.device->available_texture_mem += This->size;
+
NineUnknown_dtor(&This->base);
}
@@ -117,9 +148,10 @@ NineResource9_SetPrivateData( struct NineResource9 *This,
enum pipe_error err;
struct pheader *header;
const void *user_data = pData;
+ char guid_str[64];
- DBG("This=%p refguid=%p pData=%p SizeOfData=%u Flags=%x\n",
- This, refguid, pData, SizeOfData, Flags);
+ DBG("This=%p GUID=%s pData=%p SizeOfData=%u Flags=%x\n",
+ This, GUID_sprintf(guid_str, refguid), pData, SizeOfData, Flags);
if (Flags & D3DSPD_IUNKNOWN)
user_assert(SizeOfData == sizeof(IUnknown *), D3DERR_INVALIDCALL);
@@ -141,8 +173,9 @@ NineResource9_SetPrivateData( struct NineResource9 *This,
header->size = SizeOfData;
memcpy(header->data, user_data, header->size);
+ memcpy(&header->guid, refguid, sizeof(header->guid));
- err = util_hash_table_set(This->pdata, refguid, header);
+ err = util_hash_table_set(This->pdata, &header->guid, header);
if (err == PIPE_OK) {
if (header->unknown) { IUnknown_AddRef(*(IUnknown **)header->data); }
return D3D_OK;
@@ -162,9 +195,10 @@ NineResource9_GetPrivateData( struct NineResource9 *This,
{
struct pheader *header;
DWORD sizeofdata;
+ char guid_str[64];
- DBG("This=%p refguid=%p pData=%p pSizeOfData=%p\n",
- This, refguid, pData, pSizeOfData);
+ DBG("This=%p GUID=%s pData=%p pSizeOfData=%p\n",
+ This, GUID_sprintf(guid_str, refguid), pData, pSizeOfData);
header = util_hash_table_get(This->pdata, refguid);
if (!header) { return D3DERR_NOTFOUND; }
@@ -191,8 +225,9 @@ NineResource9_FreePrivateData( struct NineResource9 *This,
REFGUID refguid )
{
struct pheader *header;
+ char guid_str[64];
- DBG("This=%p refguid=%p\n", This, refguid);
+ DBG("This=%p GUID=%s\n", This, GUID_sprintf(guid_str, refguid));
header = util_hash_table_get(This->pdata, refguid);
if (!header)
diff --git a/src/gallium/state_trackers/nine/resource9.h b/src/gallium/state_trackers/nine/resource9.h
index 906f90806ce..8122257b7a7 100644
--- a/src/gallium/state_trackers/nine/resource9.h
+++ b/src/gallium/state_trackers/nine/resource9.h
@@ -45,6 +45,8 @@ struct NineResource9
/* for [GS]etPrivateData/FreePrivateData */
struct util_hash_table *pdata;
+
+ long long size;
};
static inline struct NineResource9 *
NineResource9( void *data )
diff --git a/src/gallium/state_trackers/nine/stateblock9.c b/src/gallium/state_trackers/nine/stateblock9.c
index 6d6e1be0b7f..0d1a04b657a 100644
--- a/src/gallium/state_trackers/nine/stateblock9.c
+++ b/src/gallium/state_trackers/nine/stateblock9.c
@@ -24,6 +24,7 @@
#include "device9.h"
#include "basetexture9.h"
#include "nine_helpers.h"
+#include "vertexdeclaration9.h"
#define DBG_CHANNEL DBG_STATEBLOCK
@@ -179,6 +180,7 @@ nine_state_copy_common(struct nine_state *dst,
const int r = ffs(m) - 1;
m &= ~(1 << r);
dst->rs[i * 32 + r] = src->rs[i * 32 + r];
+ dst->rs_advertised[i * 32 + r] = src->rs_advertised[i * 32 + r];
}
}
@@ -223,7 +225,7 @@ nine_state_copy_common(struct nine_state *dst,
nine_bind(&dst->stream[i], src->stream[i]);
if (src->stream[i]) {
dst->vtxbuf[i].buffer_offset = src->vtxbuf[i].buffer_offset;
- dst->vtxbuf[i].buffer = src->vtxbuf[i].buffer;
+ pipe_resource_reference(&dst->vtxbuf[i].buffer, src->vtxbuf[i].buffer);
dst->vtxbuf[i].stride = src->vtxbuf[i].stride;
}
}
@@ -269,6 +271,10 @@ nine_state_copy_common(struct nine_state *dst,
dst->ff.light = REALLOC(dst->ff.light,
dst->ff.num_lights * sizeof(D3DLIGHT9),
mask->ff.num_lights * sizeof(D3DLIGHT9));
+ for (i = dst->ff.num_lights; i < mask->ff.num_lights; ++i) {
+ memset(&dst->ff.light[i], 0, sizeof(D3DLIGHT9));
+ dst->ff.light[i].Type = (D3DLIGHTTYPE)NINED3DLIGHT_INVALID;
+ }
dst->ff.num_lights = mask->ff.num_lights;
}
for (i = 0; i < mask->ff.num_lights; ++i)
@@ -353,6 +359,7 @@ nine_state_copy_common_all(struct nine_state *dst,
/* Render states. */
memcpy(dst->rs, src->rs, sizeof(dst->rs));
+ memcpy(dst->rs_advertised, src->rs_advertised, sizeof(dst->rs_advertised));
if (apply)
memcpy(dst->changed.rs, src->changed.rs, sizeof(dst->changed.rs));
@@ -377,7 +384,7 @@ nine_state_copy_common_all(struct nine_state *dst,
nine_bind(&dst->stream[i], src->stream[i]);
if (src->stream[i]) {
dst->vtxbuf[i].buffer_offset = src->vtxbuf[i].buffer_offset;
- dst->vtxbuf[i].buffer = src->vtxbuf[i].buffer;
+ pipe_resource_reference(&dst->vtxbuf[i].buffer, src->vtxbuf[i].buffer);
dst->vtxbuf[i].stride = src->vtxbuf[i].stride;
}
dst->stream_freq[i] = src->stream_freq[i];
@@ -486,7 +493,10 @@ NineStateBlock9_Apply( struct NineStateBlock9 *This )
nine_state_copy_common(dst, src, src, TRUE, pool);
if ((src->changed.group & NINE_STATE_VDECL) && src->vdecl)
- nine_bind(&dst->vdecl, src->vdecl);
+ NineDevice9_SetVertexDeclaration(This->base.device, (IDirect3DVertexDeclaration9 *)src->vdecl);
+
+ /* Recomputing it is needed if we changed vs but not vdecl */
+ dst->programmable_vs = dst->vs && !(dst->vdecl && dst->vdecl->position_t);
/* Textures */
if (src->changed.texture) {
diff --git a/src/gallium/state_trackers/nine/surface9.c b/src/gallium/state_trackers/nine/surface9.c
index 14c1ce927ad..f88b75c3dd7 100644
--- a/src/gallium/state_trackers/nine/surface9.c
+++ b/src/gallium/state_trackers/nine/surface9.c
@@ -56,6 +56,9 @@ NineSurface9_ctor( struct NineSurface9 *This,
D3DSURFACE_DESC *pDesc )
{
HRESULT hr;
+ union pipe_color_union rgba = {0};
+ struct pipe_surface *surf;
+ struct pipe_context *pipe = pParams->device->pipe;
DBG("This=%p pDevice=%p pResource=%p Level=%u Layer=%u pDesc=%p\n",
This, pParams->device, pResource, Level, Layer, pDesc);
@@ -140,6 +143,12 @@ NineSurface9_ctor( struct NineSurface9 *This,
if (pResource && NineSurface9_IsOffscreenPlain(This))
pResource->flags |= NINE_RESOURCE_FLAG_LOCKABLE;
+ /* TODO: investigate what else exactly needs to be cleared */
+ if (This->base.resource && (pDesc->Usage & D3DUSAGE_RENDERTARGET)) {
+ surf = NineSurface9_GetSurface(This, 0);
+ pipe->clear_render_target(pipe, surf, &rgba, 0, 0, pDesc->Width, pDesc->Height);
+ }
+
NineSurface9_Dump(This);
return D3D_OK;
@@ -156,7 +165,7 @@ NineSurface9_dtor( struct NineSurface9 *This )
/* Release system memory when we have to manage it (no parent) */
if (!This->base.base.container && This->data)
- FREE(This->data);
+ align_free(This->data);
NineResource9_dtor(&This->base);
}
@@ -348,7 +357,7 @@ NineSurface9_LockRect( struct NineSurface9 *This,
D3DERR_INVALIDCALL);
if (pRect && This->desc.Pool == D3DPOOL_DEFAULT &&
- compressed_format (This->desc.Format)) {
+ util_format_is_compressed(This->base.info.format)) {
const unsigned w = util_format_get_blockwidth(This->base.info.format);
const unsigned h = util_format_get_blockheight(This->base.info.format);
user_assert((pRect->left == 0 && pRect->right == This->desc.Width &&
@@ -384,8 +393,8 @@ NineSurface9_LockRect( struct NineSurface9 *This,
* and bpp 8, and the app has a workaround to work with the fact
* that it is actually compressed. */
if (is_ATI1_ATI2(This->base.info.format)) {
- pLockedRect->Pitch = This->desc.Height;
- pLockedRect->pBits = This->data + box.y * This->desc.Height + box.x;
+ pLockedRect->Pitch = This->desc.Width;
+ pLockedRect->pBits = This->data + box.y * This->desc.Width + box.x;
} else {
pLockedRect->Pitch = This->stride;
pLockedRect->pBits = NineSurface9_GetSystemMemPointer(This,
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index 3b1a7a4493c..82d4173fbb2 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -118,6 +118,14 @@ NineSwapChain9_Resize( struct NineSwapChain9 *This,
DBG("This=%p pParams=%p\n", This, pParams);
user_assert(pParams != NULL, E_POINTER);
+ user_assert(pParams->SwapEffect, D3DERR_INVALIDCALL);
+ user_assert((pParams->SwapEffect != D3DSWAPEFFECT_COPY) ||
+ (pParams->BackBufferCount <= 1), D3DERR_INVALIDCALL);
+ user_assert(pDevice->ex || pParams->BackBufferCount <= 3, D3DERR_INVALIDCALL);
+ user_assert(pDevice->ex ||
+ (pParams->SwapEffect == D3DSWAPEFFECT_FLIP) ||
+ (pParams->SwapEffect == D3DSWAPEFFECT_COPY) ||
+ (pParams->SwapEffect == D3DSWAPEFFECT_DISCARD), D3DERR_INVALIDCALL);
DBG("pParams(%p):\n"
"BackBufferWidth: %u\n"
@@ -145,11 +153,6 @@ NineSwapChain9_Resize( struct NineSwapChain9 *This,
pParams->FullScreen_RefreshRateInHz,
pParams->PresentationInterval);
- if (pParams->SwapEffect == D3DSWAPEFFECT_COPY &&
- pParams->BackBufferCount > 1) {
- pParams->BackBufferCount = 1;
- }
-
if (pParams->BackBufferCount > 3) {
pParams->BackBufferCount = 3;
}
@@ -713,6 +716,10 @@ present( struct NineSwapChain9 *This,
This->pipe->blit(This->pipe, &blit);
}
+ /* The resource we present has to resolve fast clears
+ * if needed (and other things) */
+ This->pipe->flush_resource(This->pipe, resource);
+
if (This->params.SwapEffect != D3DSWAPEFFECT_DISCARD)
handle_draw_cursor_and_hud(This, resource);
@@ -738,12 +745,6 @@ bypass_rendering:
return D3DERR_WASSTILLDRAWING;
}
- if (This->present_buffers)
- resource = This->present_buffers[0];
- else
- resource = This->buffers[0]->base.resource;
- This->pipe->flush_resource(This->pipe, resource);
-
if (!This->enable_threadpool) {
This->tasks[0]=NULL;
fence = swap_fences_pop_front(This);
@@ -786,6 +787,19 @@ NineSwapChain9_Present( struct NineSwapChain9 *This,
if (hr == D3DERR_WASSTILLDRAWING)
return hr;
+ if (This->base.device->ex) {
+ if (NineSwapChain9_GetOccluded(This)) {
+ return S_PRESENT_OCCLUDED;
+ }
+ } else {
+ if (NineSwapChain9_GetOccluded(This)) {
+ This->base.device->device_needs_reset = TRUE;
+ }
+ if (This->base.device->device_needs_reset) {
+ return D3DERR_DEVICELOST;
+ }
+ }
+
switch (This->params.SwapEffect) {
case D3DSWAPEFFECT_FLIP:
UNTESTED(4);
@@ -840,7 +854,6 @@ NineSwapChain9_Present( struct NineSwapChain9 *This,
ID3DPresent_WaitBufferReleased(This->present, This->present_handles[0]);
This->base.device->state.changed.group |= NINE_STATE_FB;
- nine_update_state_framebuffer(This->base.device);
return hr;
}
@@ -907,8 +920,9 @@ NineSwapChain9_GetBackBuffer( struct NineSwapChain9 *This,
DBG("GetBackBuffer: This=%p iBackBuffer=%d Type=%d ppBackBuffer=%p\n",
This, iBackBuffer, Type, ppBackBuffer);
(void)user_error(Type == D3DBACKBUFFER_TYPE_MONO);
+ /* don't touch ppBackBuffer on error */
+ user_assert(ppBackBuffer != NULL, D3DERR_INVALIDCALL);
user_assert(iBackBuffer < This->params.BackBufferCount, D3DERR_INVALIDCALL);
- user_assert(ppBackBuffer != NULL, E_POINTER);
NineUnknown_AddRef(NineUnknown(This->buffers[iBackBuffer]));
*ppBackBuffer = (IDirect3DSurface9 *)This->buffers[iBackBuffer];
@@ -990,3 +1004,13 @@ NineSwapChain9_new( struct NineDevice9 *pDevice,
implicit, pPresent, pPresentationParameters,
pCTX, hFocusWindow, NULL);
}
+
+BOOL
+NineSwapChain9_GetOccluded( struct NineSwapChain9 *This )
+{
+ if (This->base.device->minor_version_num > 0) {
+ return ID3DPresent_GetWindowOccluded(This->present);
+ }
+
+ return FALSE;
+}
diff --git a/src/gallium/state_trackers/nine/swapchain9.h b/src/gallium/state_trackers/nine/swapchain9.h
index 5e48dde5004..4bd74f7b6ec 100644
--- a/src/gallium/state_trackers/nine/swapchain9.h
+++ b/src/gallium/state_trackers/nine/swapchain9.h
@@ -139,4 +139,7 @@ HRESULT WINAPI
NineSwapChain9_GetPresentParameters( struct NineSwapChain9 *This,
D3DPRESENT_PARAMETERS *pPresentationParameters );
+BOOL
+NineSwapChain9_GetOccluded( struct NineSwapChain9 *This );
+
#endif /* _NINE_SWAPCHAIN9_H_ */
diff --git a/src/gallium/state_trackers/nine/texture9.c b/src/gallium/state_trackers/nine/texture9.c
index bc325c1335e..ada08cea90a 100644
--- a/src/gallium/state_trackers/nine/texture9.c
+++ b/src/gallium/state_trackers/nine/texture9.c
@@ -235,7 +235,7 @@ NineTexture9_dtor( struct NineTexture9 *This )
}
if (This->managed_buffer)
- FREE(This->managed_buffer);
+ align_free(This->managed_buffer);
NineBaseTexture9_dtor(&This->base);
}
diff --git a/src/gallium/state_trackers/nine/vertexbuffer9.c b/src/gallium/state_trackers/nine/vertexbuffer9.c
index 8e2eaaf8ff9..10311b428fe 100644
--- a/src/gallium/state_trackers/nine/vertexbuffer9.c
+++ b/src/gallium/state_trackers/nine/vertexbuffer9.c
@@ -39,56 +39,13 @@ NineVertexBuffer9_ctor( struct NineVertexBuffer9 *This,
struct NineUnknownParams *pParams,
D3DVERTEXBUFFER_DESC *pDesc )
{
- struct pipe_resource *info = &This->base.info;
HRESULT hr;
DBG("This=%p Size=0x%x Usage=%x Pool=%u\n", This,
pDesc->Size, pDesc->Usage, pDesc->Pool);
- user_assert(pDesc->Pool != D3DPOOL_SCRATCH, D3DERR_INVALIDCALL);
-
- This->maps = MALLOC(sizeof(struct pipe_transfer *));
- if (!This->maps)
- return E_OUTOFMEMORY;
- This->nmaps = 0;
- This->maxmaps = 1;
-
- This->pipe = pParams->device->pipe;
-
- info->screen = pParams->device->screen;
- info->target = PIPE_BUFFER;
- info->format = PIPE_FORMAT_R8_UNORM;
- info->width0 = pDesc->Size;
- info->flags = 0;
-
- info->bind = PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_TRANSFER_WRITE;
- if (!(pDesc->Usage & D3DUSAGE_WRITEONLY))
- info->bind |= PIPE_BIND_TRANSFER_READ;
-
- info->usage = PIPE_USAGE_DEFAULT;
- if (pDesc->Usage & D3DUSAGE_DYNAMIC)
- info->usage = PIPE_USAGE_STREAM;
- if (pDesc->Pool == D3DPOOL_SYSTEMMEM)
- info->usage = PIPE_USAGE_STAGING;
-
- /* if (pDesc->Usage & D3DUSAGE_DONOTCLIP) { } */
- /* if (pDesc->Usage & D3DUSAGE_NONSECURE) { } */
- /* if (pDesc->Usage & D3DUSAGE_NPATCHES) { } */
- /* if (pDesc->Usage & D3DUSAGE_POINTS) { } */
- /* if (pDesc->Usage & D3DUSAGE_RTPATCHES) { } */
- if (pDesc->Usage & D3DUSAGE_SOFTWAREPROCESSING)
- DBG("Application asked for Software Vertex Processing, "
- "but this is unimplemented\n");
- /* if (pDesc->Usage & D3DUSAGE_TEXTAPI) { } */
-
- info->height0 = 1;
- info->depth0 = 1;
- info->array_size = 1;
- info->last_level = 0;
- info->nr_samples = 0;
-
- hr = NineResource9_ctor(&This->base, pParams, NULL, TRUE,
- D3DRTYPE_VERTEXBUFFER, pDesc->Pool, pDesc->Usage);
+ hr = NineBuffer9_ctor(&This->base, pParams, D3DRTYPE_VERTEXBUFFER,
+ pDesc->Usage, pDesc->Size, pDesc->Pool);
if (FAILED(hr))
return hr;
@@ -102,85 +59,29 @@ NineVertexBuffer9_ctor( struct NineVertexBuffer9 *This,
void
NineVertexBuffer9_dtor( struct NineVertexBuffer9 *This )
{
- if (This->maps) {
- while (This->nmaps) {
- NineVertexBuffer9_Unlock(This);
- }
- FREE(This->maps);
- }
+ NineBuffer9_dtor(&This->base);
+}
- NineResource9_dtor(&This->base);
+struct pipe_resource *
+NineVertexBuffer9_GetResource( struct NineVertexBuffer9 *This )
+{
+ return NineBuffer9_GetResource(&This->base);
}
HRESULT WINAPI
NineVertexBuffer9_Lock( struct NineVertexBuffer9 *This,
- UINT OffsetToLock,
- UINT SizeToLock,
- void **ppbData,
- DWORD Flags )
+ UINT OffsetToLock,
+ UINT SizeToLock,
+ void **ppbData,
+ DWORD Flags )
{
- struct pipe_box box;
- void *data;
- const unsigned usage = d3dlock_buffer_to_pipe_transfer_usage(Flags);
-
- DBG("This=%p(pipe=%p) OffsetToLock=0x%x, SizeToLock=0x%x, Flags=0x%x\n",
- This, This->base.resource,
- OffsetToLock, SizeToLock, Flags);
-
- user_assert(ppbData, E_POINTER);
- user_assert(!(Flags & ~(D3DLOCK_DISCARD |
- D3DLOCK_DONOTWAIT |
- D3DLOCK_NO_DIRTY_UPDATE |
- D3DLOCK_NOSYSLOCK |
- D3DLOCK_READONLY |
- D3DLOCK_NOOVERWRITE)), D3DERR_INVALIDCALL);
-
- if (This->nmaps == This->maxmaps) {
- struct pipe_transfer **newmaps =
- REALLOC(This->maps, sizeof(struct pipe_transfer *)*This->maxmaps,
- sizeof(struct pipe_transfer *)*(This->maxmaps << 1));
- if (newmaps == NULL)
- return E_OUTOFMEMORY;
-
- This->maxmaps <<= 1;
- This->maps = newmaps;
- }
-
- if (SizeToLock == 0) {
- SizeToLock = This->desc.Size - OffsetToLock;
- user_warn(OffsetToLock != 0);
- }
-
- u_box_1d(OffsetToLock, SizeToLock, &box);
-
- data = This->pipe->transfer_map(This->pipe, This->base.resource, 0,
- usage, &box, &This->maps[This->nmaps]);
- if (!data) {
- DBG("pipe::transfer_map failed\n"
- " usage = %x\n"
- " box.x = %u\n"
- " box.width = %u\n",
- usage, box.x, box.width);
- /* not sure what to return, msdn suggests this */
- if (Flags & D3DLOCK_DONOTWAIT)
- return D3DERR_WASSTILLDRAWING;
- return D3DERR_INVALIDCALL;
- }
-
- This->nmaps++;
- *ppbData = data;
-
- return D3D_OK;
+ return NineBuffer9_Lock(&This->base, OffsetToLock, SizeToLock, ppbData, Flags);
}
HRESULT WINAPI
NineVertexBuffer9_Unlock( struct NineVertexBuffer9 *This )
{
- DBG("This=%p\n", This);
-
- user_assert(This->nmaps > 0, D3DERR_INVALIDCALL);
- This->pipe->transfer_unmap(This->pipe, This->maps[--(This->nmaps)]);
- return D3D_OK;
+ return NineBuffer9_Unlock(&This->base);
}
HRESULT WINAPI
diff --git a/src/gallium/state_trackers/nine/vertexbuffer9.h b/src/gallium/state_trackers/nine/vertexbuffer9.h
index 6174de4df08..859402b925b 100644
--- a/src/gallium/state_trackers/nine/vertexbuffer9.h
+++ b/src/gallium/state_trackers/nine/vertexbuffer9.h
@@ -22,8 +22,8 @@
#ifndef _NINE_VERTEXBUFFER9_H_
#define _NINE_VERTEXBUFFER9_H_
-
#include "resource9.h"
+#include "buffer9.h"
struct pipe_screen;
struct pipe_context;
@@ -31,13 +31,10 @@ struct pipe_transfer;
struct NineVertexBuffer9
{
- struct NineResource9 base;
+ struct NineBuffer9 base;
/* G3D */
struct pipe_context *pipe;
- struct pipe_transfer **maps;
- int nmaps, maxmaps;
-
D3DVERTEXBUFFER_DESC desc;
};
static inline struct NineVertexBuffer9 *
@@ -58,6 +55,12 @@ NineVertexBuffer9_ctor( struct NineVertexBuffer9 *This,
void
NineVertexBuffer9_dtor( struct NineVertexBuffer9 *This );
+/*** Nine private ***/
+
+struct pipe_resource *
+NineVertexBuffer9_GetResource( struct NineVertexBuffer9 *This );
+
+/*** Direct3D public ***/
HRESULT WINAPI
NineVertexBuffer9_Lock( struct NineVertexBuffer9 *This,
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.c b/src/gallium/state_trackers/nine/vertexdeclaration9.c
index 2047b91abc4..36c594b5be3 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.c
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.c
@@ -174,24 +174,24 @@ NineVertexDeclaration9_ctor( struct NineVertexDeclaration9 *This,
const D3DVERTEXELEMENT9 *pElements )
{
const D3DCAPS9 *caps;
- unsigned i;
-
+ unsigned i, nelems;
DBG("This=%p pParams=%p pElements=%p\n", This, pParams, pElements);
+ /* wine */
+ for (nelems = 0;
+ pElements[nelems].Stream != 0xFF;
+ ++nelems) {
+ user_assert(pElements[nelems].Type != D3DDECLTYPE_UNUSED, E_FAIL);
+ user_assert(!(pElements[nelems].Offset & 3), E_FAIL);
+ }
+
+ caps = NineDevice9_GetCaps(pParams->device);
+ user_assert(nelems <= caps->MaxStreams, D3DERR_INVALIDCALL);
+
HRESULT hr = NineUnknown_ctor(&This->base, pParams);
if (FAILED(hr)) { return hr; }
- /* wine */
- for (This->nelems = 0;
- pElements[This->nelems].Stream != 0xFF;
- ++This->nelems) {
- user_assert(pElements[This->nelems].Type != D3DDECLTYPE_UNUSED, E_FAIL);
- user_assert(!(pElements[This->nelems].Offset & 3), E_FAIL);
- }
-
- caps = NineDevice9_GetCaps(This->base.device);
- user_assert(This->nelems <= caps->MaxStreams, D3DERR_INVALIDCALL);
-
+ This->nelems = nelems;
This->decls = CALLOC(This->nelems+1, sizeof(D3DVERTEXELEMENT9));
This->elems = CALLOC(This->nelems, sizeof(struct pipe_vertex_element));
This->usage_map = CALLOC(This->nelems, sizeof(uint16_t));
@@ -203,6 +203,9 @@ NineVertexDeclaration9_ctor( struct NineVertexDeclaration9 *This,
This->decls[i].UsageIndex);
This->usage_map[i] = usage;
+ if (This->decls[i].Usage == D3DDECLUSAGE_POSITIONT)
+ This->position_t = TRUE;
+
This->elems[i].src_offset = This->decls[i].Offset;
This->elems[i].instance_divisor = 0;
This->elems[i].vertex_buffer_index = This->decls[i].Stream;
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.h b/src/gallium/state_trackers/nine/vertexdeclaration9.h
index 655bcfbf165..e39f259440f 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.h
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.h
@@ -46,6 +46,8 @@ struct NineVertexDeclaration9
D3DVERTEXELEMENT9 *decls;
DWORD fvf;
+
+ BOOL position_t;
};
static inline struct NineVertexDeclaration9 *
NineVertexDeclaration9( void *data )
diff --git a/src/gallium/state_trackers/nine/volume9.c b/src/gallium/state_trackers/nine/volume9.c
index 0b9005685a9..f6988923caa 100644
--- a/src/gallium/state_trackers/nine/volume9.c
+++ b/src/gallium/state_trackers/nine/volume9.c
@@ -136,7 +136,7 @@ NineVolume9_dtor( struct NineVolume9 *This )
NineVolume9_UnlockBox(This);
if (This->data)
- FREE(This->data);
+ align_free(This->data);
pipe_resource_reference(&This->resource, NULL);
@@ -264,6 +264,13 @@ NineVolume9_LockBox( struct NineVolume9 *This,
usage |= PIPE_TRANSFER_DONTBLOCK;
if (pBox) {
+ user_assert(pBox->Right > pBox->Left, D3DERR_INVALIDCALL);
+ user_assert(pBox->Bottom > pBox->Top, D3DERR_INVALIDCALL);
+ user_assert(pBox->Back > pBox->Front, D3DERR_INVALIDCALL);
+ user_assert(pBox->Right <= This->desc.Width, D3DERR_INVALIDCALL);
+ user_assert(pBox->Bottom <= This->desc.Height, D3DERR_INVALIDCALL);
+ user_assert(pBox->Back <= This->desc.Depth, D3DERR_INVALIDCALL);
+
d3dbox_to_pipe_box(&box, pBox);
if (u_box_clip_2d(&box, &box, This->desc.Width, This->desc.Height) < 0) {
DBG("Locked volume intersection empty.\n");
diff --git a/src/gallium/state_trackers/omx/vid_dec_h264.c b/src/gallium/state_trackers/omx/vid_dec_h264.c
index f66ed896e62..b4536828909 100644
--- a/src/gallium/state_trackers/omx/vid_dec_h264.c
+++ b/src/gallium/state_trackers/omx/vid_dec_h264.c
@@ -35,6 +35,7 @@
#include "util/u_memory.h"
#include "util/u_video.h"
#include "vl/vl_rbsp.h"
+#include "vl/vl_zscan.h"
#include "entrypoint.h"
#include "vid_dec.h"
@@ -205,6 +206,7 @@ static void scaling_list(struct vl_rbsp *rbsp, uint8_t *scalingList, unsigned si
const uint8_t *defaultList, const uint8_t *fallbackList)
{
unsigned lastScale = 8, nextScale = 8;
+ const int *list;
unsigned i;
/* (pic|seq)_scaling_list_present_flag[i] */
@@ -214,6 +216,7 @@ static void scaling_list(struct vl_rbsp *rbsp, uint8_t *scalingList, unsigned si
return;
}
+ list = (sizeOfScalingList == 16) ? vl_zscan_normal_16 : vl_zscan_normal;
for (i = 0; i < sizeOfScalingList; ++i ) {
if (nextScale != 0) {
@@ -224,8 +227,8 @@ static void scaling_list(struct vl_rbsp *rbsp, uint8_t *scalingList, unsigned si
return;
}
}
- scalingList[i] = nextScale == 0 ? lastScale : nextScale;
- lastScale = scalingList[i];
+ scalingList[list[i]] = nextScale == 0 ? lastScale : nextScale;
+ lastScale = scalingList[list[i]];
}
}
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index 5cd1ba7815c..233db8ae372 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -53,22 +53,29 @@ DRI_CONF_BEGIN
DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_1)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_NINE
+ DRI_CONF_NINE_OVERRIDEVENDOR(-1)
DRI_CONF_NINE_THROTTLE(-2)
DRI_CONF_NINE_THREADSUBMIT("false")
DRI_CONF_SECTION_END
DRI_CONF_END;
-/* define fallback value here: NVIDIA GeForce GTX 970 */
-#define FALLBACK_NAME "NV124"
-#define FALLBACK_DEVID 0x13C2
-#define FALLBACK_VENID 0x10de
+struct fallback_card_config {
+ const char *name;
+ unsigned vendor_id;
+ unsigned device_id;
+} fallback_cards[] = {
+ {"NV124", 0x10de, 0x13C2}, /* NVIDIA GeForce GTX 970 */
+ {"HAWAII", 0x1002, 0x67b1}, /* AMD Radeon R9 290 */
+ {"Haswell Mobile", 0x8086, 0x0416}, /* Intel Haswell Mobile GT2 */
+ {"SVGA3D", 0x15ad, 0x0405}, /* VMware SVGA 3D */
+};
/* prototypes */
void
d3d_match_vendor_id( D3DADAPTER_IDENTIFIER9* drvid,
- unsigned fallback_ven,
- unsigned fallback_dev,
- const char* fallback_name );
+ unsigned fallback_ven,
+ unsigned fallback_dev,
+ const char* fallback_name );
void d3d_fill_driver_version(D3DADAPTER_IDENTIFIER9* drvid);
@@ -118,9 +125,9 @@ get_bus_info( int fd,
*subsysid = 0;
*revision = 0;
} else {
- DBG("Unable to detect card. Faking %s\n", FALLBACK_NAME);
- *vendorid = FALLBACK_VENID;
- *deviceid = FALLBACK_DEVID;
+ DBG("Unable to detect card. Faking %s\n", fallback_cards[0].name);
+ *vendorid = fallback_cards[0].vendor_id;
+ *deviceid = fallback_cards[0].device_id;
*subsysid = 0;
*revision = 0;
}
@@ -128,8 +135,10 @@ get_bus_info( int fd,
static inline void
read_descriptor( struct d3dadapter9_context *ctx,
- int fd )
+ int fd, int override_vendorid )
{
+ unsigned i;
+ BOOL found;
D3DADAPTER_IDENTIFIER9 *drvid = &ctx->identifier;
memset(drvid, 0, sizeof(*drvid));
@@ -140,9 +149,30 @@ read_descriptor( struct d3dadapter9_context *ctx,
strncpy(drvid->Description, ctx->hal->get_name(ctx->hal),
sizeof(drvid->Description));
+ if (override_vendorid > 0) {
+ found = FALSE;
+ /* fill in device_id and card name for fake vendor */
+ for (i = 0; i < sizeof(fallback_cards)/sizeof(fallback_cards[0]); i++) {
+ if (fallback_cards[i].vendor_id == override_vendorid) {
+ DBG("Faking card '%s' vendor 0x%04x, device 0x%04x\n",
+ fallback_cards[i].name,
+ fallback_cards[i].vendor_id,
+ fallback_cards[i].device_id);
+ drvid->VendorId = fallback_cards[i].vendor_id;
+ drvid->DeviceId = fallback_cards[i].device_id;
+ strncpy(drvid->Description, fallback_cards[i].name,
+ sizeof(drvid->Description));
+ found = TRUE;
+ break;
+ }
+ }
+ if (!found) {
+ DBG("Unknown fake vendor 0x%04x! Using detected vendor instead.\n", override_vendorid);
+ }
+ }
/* choose fall-back vendor if necessary to allow
* the following functions to return sane results */
- d3d_match_vendor_id(drvid, FALLBACK_VENID, FALLBACK_DEVID, FALLBACK_NAME);
+ d3d_match_vendor_id(drvid, fallback_cards[0].vendor_id, fallback_cards[0].device_id, fallback_cards[0].name);
/* fill in driver name and version info */
d3d_fill_driver_version(drvid);
/* override Description field with Windows like names */
@@ -177,6 +207,7 @@ drm_create_adapter( int fd,
driOptionCache defaultInitOptions;
driOptionCache userInitOptions;
int throttling_value_user = -2;
+ int override_vendorid = -1;
if (!ctx) { return E_OUTOFMEMORY; }
@@ -247,6 +278,10 @@ drm_create_adapter( int fd,
"You should not expect any benefit.");
}
+ if (driCheckOption(&userInitOptions, "override_vendorid", DRI_INT)) {
+ override_vendorid = driQueryOptioni(&userInitOptions, "override_vendorid");
+ }
+
driDestroyOptionCache(&userInitOptions);
driDestroyOptionInfo(&defaultInitOptions);
@@ -260,7 +295,7 @@ drm_create_adapter( int fd,
}
/* read out PCI info */
- read_descriptor(&ctx->base, fd);
+ read_descriptor(&ctx->base, fd, override_vendorid);
/* create and return new ID3DAdapter9 */
hr = NineAdapter9_new(&ctx->base, (struct NineAdapter9 **)ppAdapter);
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index d4030852943..2a7738e6979 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -94,7 +94,7 @@ gallium_DRIVERS += libmesa_winsys_vc4 libmesa_pipe_vc4
endif
ifneq ($(filter virgl,$(MESA_GPU_DRIVERS)),)
LOCAL_CFLAGS += -DGALLIUM_VIRGL
-gallium_DRIVERS += libmesa_winsys_virgl libmesa_pipe_virgl
+gallium_DRIVERS += libmesa_winsys_virgl libmesa_winsys_virgl_vtest libmesa_pipe_virgl
endif
ifneq ($(filter vmwgfx,$(MESA_GPU_DRIVERS)),)
gallium_DRIVERS += libmesa_winsys_svga libmesa_pipe_svga
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 30a1aa8d6ba..59a801b1426 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -288,16 +288,17 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
request.alloc_size = size;
request.phys_alignment = alignment;
- if (initial_domain & RADEON_DOMAIN_VRAM) {
+ if (initial_domain & RADEON_DOMAIN_VRAM)
request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
- if (flags & RADEON_FLAG_CPU_ACCESS)
- request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
- }
- if (initial_domain & RADEON_DOMAIN_GTT) {
+ if (initial_domain & RADEON_DOMAIN_GTT)
request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
- if (flags & RADEON_FLAG_GTT_WC)
- request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
- }
+
+ if (flags & RADEON_FLAG_CPU_ACCESS)
+ request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
+ if (flags & RADEON_FLAG_NO_CPU_ACCESS)
+ request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
+ if (flags & RADEON_FLAG_GTT_WC)
+ request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
if (r) {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 7393a1d1eb4..dab27dfba96 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -68,7 +68,6 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) {
case CIK__PIPE_CONFIG__ADDR_SURF_P2:
- default:
return 2;
case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
@@ -86,23 +85,13 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
return 16;
+ default:
+ fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n");
+ assert(!"this should never occur");
+ return 2;
}
}
-/* Convert Sea Islands register values GB_ADDR_CFG and MC_ADDR_CFG
- * into GB_TILING_CONFIG register which is only present on R600-R700. */
-static unsigned r600_get_gb_tiling_config(struct amdgpu_gpu_info *info)
-{
- unsigned num_pipes = info->gb_addr_cfg & 0x7;
- unsigned num_banks = info->mc_arb_ramcfg & 0x3;
- unsigned pipe_interleave_bytes = (info->gb_addr_cfg >> 4) & 0x7;
- unsigned row_size = (info->gb_addr_cfg >> 28) & 0x3;
-
- return num_pipes | (num_banks << 4) |
- (pipe_interleave_bytes << 8) |
- (row_size << 12);
-}
-
/* Helper function to do the ioctls needed for setup and init. */
static boolean do_winsys_init(struct amdgpu_winsys *ws)
{
@@ -251,20 +240,19 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws)
ws->info.gart_size = gtt.heap_size;
ws->info.vram_size = vram.heap_size;
/* convert the shader clock from KHz to MHz */
- ws->info.max_sclk = ws->amdinfo.max_engine_clk / 1000;
+ ws->info.max_shader_clock = ws->amdinfo.max_engine_clk / 1000;
ws->info.max_se = ws->amdinfo.num_shader_engines;
ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine;
ws->info.has_uvd = uvd.available_rings != 0;
ws->info.vce_fw_version =
vce.available_rings ? vce_version : 0;
ws->info.has_userptr = TRUE;
- ws->info.r600_num_backends = ws->amdinfo.rb_pipes;
- ws->info.r600_clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
- ws->info.r600_tiling_config = r600_get_gb_tiling_config(&ws->amdinfo);
- ws->info.r600_num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo);
- ws->info.r600_max_pipes = ws->amdinfo.max_quad_shader_pipes; /* TODO: is this correct? */
- ws->info.r600_virtual_address = TRUE;
- ws->info.r600_has_dma = dma.available_rings != 0;
+ ws->info.num_render_backends = ws->amdinfo.rb_pipes;
+ ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
+ ws->info.num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo);
+ ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7);
+ ws->info.has_virtual_memory = TRUE;
+ ws->info.has_sdma = dma.available_rings != 0;
/* Get the number of good compute units. */
ws->info.num_good_compute_units = 0;
@@ -276,7 +264,7 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws)
memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode,
sizeof(ws->amdinfo.gb_tile_mode));
ws->info.si_tile_mode_array_valid = TRUE;
- ws->info.si_backend_enabled_mask = ws->amdinfo.enabled_rb_pipes_mask;
+ ws->info.enabled_rb_mask = ws->amdinfo.enabled_rb_pipes_mask;
memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode,
sizeof(ws->amdinfo.gb_macro_tile_mode));
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 3ec6a065c7d..7e9ed0ca0fe 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -281,7 +281,7 @@ void radeon_bo_destroy(struct pb_buffer *_buf)
if (bo->ptr)
os_munmap(bo->ptr, bo->base.size);
- if (rws->info.r600_virtual_address) {
+ if (rws->info.has_virtual_memory) {
if (rws->va_unmap_working) {
struct drm_radeon_gem_va va;
@@ -552,7 +552,7 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
pipe_mutex_init(bo->map_mutex);
pb_cache_init_entry(&rws->bo_cache, &bo->cache_entry, &bo->base);
- if (rws->info.r600_virtual_address) {
+ if (rws->info.has_virtual_memory) {
struct drm_radeon_gem_va va;
bo->va = radeon_bomgr_find_va(rws, size, alignment);
@@ -834,7 +834,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
pipe_mutex_unlock(ws->bo_handles_mutex);
- if (ws->info.r600_virtual_address) {
+ if (ws->info.has_virtual_memory) {
struct drm_radeon_gem_va va;
bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
@@ -966,7 +966,7 @@ done:
if (stride)
*stride = whandle->stride;
- if (ws->info.r600_virtual_address && !bo->va) {
+ if (ws->info.has_virtual_memory && !bo->va) {
struct drm_radeon_gem_va va;
bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 085071c381c..155a13008a4 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -283,7 +283,7 @@ static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
* This doesn't have to be done if virtual memory is enabled,
* because there is no offset patching with virtual memory.
*/
- if (cs->base.ring_type != RING_DMA || cs->ws->info.r600_virtual_address) {
+ if (cs->base.ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
return i;
}
}
@@ -540,7 +540,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
cs->cst->flags[0] = 0;
cs->cst->flags[1] = RADEON_CS_RING_DMA;
cs->cst->cs.num_chunks = 3;
- if (cs->ws->info.r600_virtual_address) {
+ if (cs->ws->info.has_virtual_memory) {
cs->cst->flags[0] |= RADEON_CS_USE_VM;
}
break;
@@ -567,7 +567,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
cs->cst->cs.num_chunks = 3;
}
- if (cs->ws->info.r600_virtual_address) {
+ if (cs->ws->info.has_virtual_memory) {
cs->cst->flags[0] |= RADEON_CS_USE_VM;
cs->cst->cs.num_chunks = 3;
}
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 8a1ed3ae08c..35dc7e69dcf 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -298,10 +298,10 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
}
/* Check for dma */
- ws->info.r600_has_dma = FALSE;
+ ws->info.has_sdma = FALSE;
/* DMA is disabled on R700. There is IB corruption and hangs. */
if (ws->info.chip_class >= EVERGREEN && ws->info.drm_minor >= 27) {
- ws->info.r600_has_dma = TRUE;
+ ws->info.has_sdma = TRUE;
}
/* Check for UVD and VCE */
@@ -351,11 +351,11 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
/* Get max clock frequency info and convert it to MHz */
radeon_get_drm_value(ws->fd, RADEON_INFO_MAX_SCLK, NULL,
- &ws->info.max_sclk);
- ws->info.max_sclk /= 1000;
+ &ws->info.max_shader_clock);
+ ws->info.max_shader_clock /= 1000;
radeon_get_drm_value(ws->fd, RADEON_INFO_SI_BACKEND_ENABLED_MASK, NULL,
- &ws->info.si_backend_enabled_mask);
+ &ws->info.enabled_rb_mask);
ws->num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -372,51 +372,72 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
return FALSE;
}
else if (ws->gen >= DRV_R600) {
+ uint32_t tiling_config = 0;
+
if (ws->info.drm_minor >= 9 &&
!radeon_get_drm_value(ws->fd, RADEON_INFO_NUM_BACKENDS,
"num backends",
- &ws->info.r600_num_backends))
+ &ws->info.num_render_backends))
return FALSE;
/* get the GPU counter frequency, failure is not fatal */
radeon_get_drm_value(ws->fd, RADEON_INFO_CLOCK_CRYSTAL_FREQ, NULL,
- &ws->info.r600_clock_crystal_freq);
+ &ws->info.clock_crystal_freq);
radeon_get_drm_value(ws->fd, RADEON_INFO_TILING_CONFIG, NULL,
- &ws->info.r600_tiling_config);
+ &tiling_config);
+
+ ws->info.r600_num_banks =
+ ws->info.chip_class >= EVERGREEN ?
+ 4 << ((tiling_config & 0xf0) >> 4) :
+ 4 << ((tiling_config & 0x30) >> 4);
+
+ ws->info.pipe_interleave_bytes =
+ ws->info.chip_class >= EVERGREEN ?
+ 256 << ((tiling_config & 0xf00) >> 8) :
+ 256 << ((tiling_config & 0xc0) >> 6);
+
+ if (!ws->info.pipe_interleave_bytes)
+ ws->info.pipe_interleave_bytes =
+ ws->info.chip_class >= EVERGREEN ? 512 : 256;
if (ws->info.drm_minor >= 11) {
radeon_get_drm_value(ws->fd, RADEON_INFO_NUM_TILE_PIPES, NULL,
- &ws->info.r600_num_tile_pipes);
+ &ws->info.num_tile_pipes);
if (radeon_get_drm_value(ws->fd, RADEON_INFO_BACKEND_MAP, NULL,
- &ws->info.r600_backend_map))
- ws->info.r600_backend_map_valid = TRUE;
+ &ws->info.r600_gb_backend_map))
+ ws->info.r600_gb_backend_map_valid = TRUE;
+ } else {
+ ws->info.num_tile_pipes =
+ ws->info.chip_class >= EVERGREEN ?
+ 1 << (tiling_config & 0xf) :
+ 1 << ((tiling_config & 0xe) >> 1);
}
- ws->info.r600_virtual_address = FALSE;
+ ws->info.has_virtual_memory = FALSE;
if (ws->info.drm_minor >= 13) {
uint32_t ib_vm_max_size;
- ws->info.r600_virtual_address = TRUE;
+ ws->info.has_virtual_memory = TRUE;
if (!radeon_get_drm_value(ws->fd, RADEON_INFO_VA_START, NULL,
&ws->va_start))
- ws->info.r600_virtual_address = FALSE;
+ ws->info.has_virtual_memory = FALSE;
if (!radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
&ib_vm_max_size))
- ws->info.r600_virtual_address = FALSE;
+ ws->info.has_virtual_memory = FALSE;
radeon_get_drm_value(ws->fd, RADEON_INFO_VA_UNMAP_WORKING, NULL,
&ws->va_unmap_working);
}
if (ws->gen == DRV_R600 && !debug_get_bool_option("RADEON_VA", FALSE))
- ws->info.r600_virtual_address = FALSE;
+ ws->info.has_virtual_memory = FALSE;
}
/* Get max pipes, this is only needed for compute shaders. All evergreen+
* chips have at least 2 pipes, so we use 2 as a default. */
- ws->info.r600_max_pipes = 2;
+ ws->info.r600_max_quad_pipes = 2;
radeon_get_drm_value(ws->fd, RADEON_INFO_MAX_PIPES, NULL,
- &ws->info.r600_max_pipes);
+ &ws->info.r600_max_quad_pipes);
/* All GPUs have at least one compute unit */
ws->info.num_good_compute_units = 1;
@@ -742,7 +763,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
ws->fd = dup(fd);
if (!do_winsys_init(ws))
- goto fail;
+ goto fail1;
pb_cache_init(&ws->bo_cache, 500000, 2.0f, 0,
MIN2(ws->info.vram_size, ws->info.gart_size),
@@ -812,8 +833,9 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
return &ws->base;
fail:
- pipe_mutex_unlock(fd_tab_mutex);
pb_cache_deinit(&ws->bo_cache);
+fail1:
+ pipe_mutex_unlock(fd_tab_mutex);
if (ws->surf_man)
radeon_surface_manager_free(ws->surf_man);
if (ws->fd >= 0)
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_public.h b/src/gallium/winsys/virgl/drm/virgl_drm_public.h
index be01021ca9a..f70f0e50448 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_public.h
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_public.h
@@ -23,8 +23,8 @@
#ifndef VIRGL_DRM_PUBLIC_H
#define VIRGL_DRM_PUBLIC_H
-struct virgl_winsys;
+struct pipe_screen;
-struct virgl_winsys *virgl_drm_winsys_create(int drmFD);
+struct pipe_screen *virgl_drm_screen_create(int fd);
#endif
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index b5d4435e5e6..ba009882ec2 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -25,6 +25,7 @@
#include
#include
#include
+#include
#include "os/os_mman.h"
#include "os/os_time.h"
@@ -33,6 +34,8 @@
#include "util/u_hash_table.h"
#include "util/u_inlines.h"
#include "state_tracker/drm_driver.h"
+#include "virgl/virgl_screen.h"
+#include "virgl/virgl_public.h"
#include
#include "virtgpu_drm.h"
@@ -50,10 +53,17 @@ static void virgl_hw_res_destroy(struct virgl_drm_winsys *qdws,
{
struct drm_gem_close args;
- if (res->name) {
+ if (res->flinked) {
+ pipe_mutex_lock(qdws->bo_handles_mutex);
+ util_hash_table_remove(qdws->bo_names,
+ (void *)(uintptr_t)res->flink);
+ pipe_mutex_unlock(qdws->bo_handles_mutex);
+ }
+
+ if (res->bo_handle) {
pipe_mutex_lock(qdws->bo_handles_mutex);
util_hash_table_remove(qdws->bo_handles,
- (void *)(uintptr_t)res->name);
+ (void *)(uintptr_t)res->bo_handle);
pipe_mutex_unlock(qdws->bo_handles_mutex);
}
@@ -109,6 +119,7 @@ virgl_drm_winsys_destroy(struct virgl_winsys *qws)
virgl_cache_flush(qdws);
util_hash_table_destroy(qdws->bo_handles);
+ util_hash_table_destroy(qdws->bo_names);
pipe_mutex_destroy(qdws->bo_handles_mutex);
pipe_mutex_destroy(qdws->mutex);
@@ -367,11 +378,12 @@ virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws,
struct drm_gem_open open_arg = {};
struct drm_virtgpu_resource_info info_arg = {};
struct virgl_hw_res *res;
+ uint32_t handle = whandle->handle;
pipe_mutex_lock(qdws->bo_handles_mutex);
if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
- res = util_hash_table_get(qdws->bo_handles, (void*)(uintptr_t)whandle->handle);
+ res = util_hash_table_get(qdws->bo_names, (void*)(uintptr_t)handle);
if (res) {
struct virgl_hw_res *r = NULL;
virgl_drm_resource_reference(qdws, &r, res);
@@ -379,21 +391,31 @@ virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws,
}
}
+ if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+ int r;
+ r = drmPrimeFDToHandle(qdws->fd, whandle->handle, &handle);
+ if (r) {
+ res = NULL;
+ goto done;
+ }
+ }
+
+ res = util_hash_table_get(qdws->bo_handles, (void*)(uintptr_t)handle);
+ fprintf(stderr, "resource %p for handle %d, pfd=%d\n", res, handle, whandle->handle);
+ if (res) {
+ struct virgl_hw_res *r = NULL;
+ virgl_drm_resource_reference(qdws, &r, res);
+ goto done;
+ }
+
res = CALLOC_STRUCT(virgl_hw_res);
if (!res)
goto done;
if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
- int r;
- uint32_t handle;
- r = drmPrimeFDToHandle(qdws->fd, whandle->handle, &handle);
- if (r) {
- FREE(res);
- res = NULL;
- goto done;
- }
res->bo_handle = handle;
} else {
+ fprintf(stderr, "gem open handle %d\n", handle);
memset(&open_arg, 0, sizeof(open_arg));
open_arg.name = whandle->handle;
if (drmIoctl(qdws->fd, DRM_IOCTL_GEM_OPEN, &open_arg)) {
@@ -403,7 +425,7 @@ virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws,
}
res->bo_handle = open_arg.handle;
}
- res->name = whandle->handle;
+ res->name = handle;
memset(&info_arg, 0, sizeof(info_arg));
info_arg.bo_handle = res->bo_handle;
@@ -422,7 +444,7 @@ virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws,
pipe_reference_init(&res->reference, 1);
res->num_cs_references = 0;
- util_hash_table_set(qdws->bo_handles, (void *)(uintptr_t)whandle->handle, res);
+ util_hash_table_set(qdws->bo_handles, (void *)(uintptr_t)handle, res);
done:
pipe_mutex_unlock(qdws->bo_handles_mutex);
@@ -452,7 +474,7 @@ static boolean virgl_drm_winsys_resource_get_handle(struct virgl_winsys *qws,
res->flink = flink.name;
pipe_mutex_lock(qdws->bo_handles_mutex);
- util_hash_table_set(qdws->bo_handles, (void *)(uintptr_t)res->flink, res);
+ util_hash_table_set(qdws->bo_names, (void *)(uintptr_t)res->flink, res);
pipe_mutex_unlock(qdws->bo_handles_mutex);
}
whandle->handle = res->flink;
@@ -732,7 +754,7 @@ static void virgl_fence_reference(struct virgl_winsys *vws,
}
-struct virgl_winsys *
+static struct virgl_winsys *
virgl_drm_winsys_create(int drmFD)
{
struct virgl_drm_winsys *qdws;
@@ -748,6 +770,7 @@ virgl_drm_winsys_create(int drmFD)
pipe_mutex_init(qdws->mutex);
pipe_mutex_init(qdws->bo_handles_mutex);
qdws->bo_handles = util_hash_table_create(handle_hash, handle_compare);
+ qdws->bo_names = util_hash_table_create(handle_hash, handle_compare);
qdws->base.destroy = virgl_drm_winsys_destroy;
qdws->base.transfer_put = virgl_bo_transfer_put;
@@ -772,3 +795,87 @@ virgl_drm_winsys_create(int drmFD)
return &qdws->base;
}
+
+static struct util_hash_table *fd_tab = NULL;
+pipe_static_mutex(virgl_screen_mutex);
+
+static void
+virgl_drm_screen_destroy(struct pipe_screen *pscreen)
+{
+ struct virgl_screen *screen = virgl_screen(pscreen);
+ boolean destroy;
+
+ pipe_mutex_lock(virgl_screen_mutex);
+ destroy = --screen->refcnt == 0;
+ if (destroy) {
+ int fd = virgl_drm_winsys(screen->vws)->fd;
+ util_hash_table_remove(fd_tab, intptr_to_pointer(fd));
+ }
+ pipe_mutex_unlock(virgl_screen_mutex);
+
+ if (destroy) {
+ pscreen->destroy = screen->winsys_priv;
+ pscreen->destroy(pscreen);
+ }
+}
+
+static unsigned hash_fd(void *key)
+{
+ int fd = pointer_to_intptr(key);
+ struct stat stat;
+ fstat(fd, &stat);
+
+ return stat.st_dev ^ stat.st_ino ^ stat.st_rdev;
+}
+
+static int compare_fd(void *key1, void *key2)
+{
+ int fd1 = pointer_to_intptr(key1);
+ int fd2 = pointer_to_intptr(key2);
+ struct stat stat1, stat2;
+ fstat(fd1, &stat1);
+ fstat(fd2, &stat2);
+
+ return stat1.st_dev != stat2.st_dev ||
+ stat1.st_ino != stat2.st_ino ||
+ stat1.st_rdev != stat2.st_rdev;
+}
+
+struct pipe_screen *
+virgl_drm_screen_create(int fd)
+{
+ struct pipe_screen *pscreen = NULL;
+
+ pipe_mutex_lock(virgl_screen_mutex);
+ if (!fd_tab) {
+ fd_tab = util_hash_table_create(hash_fd, compare_fd);
+ if (!fd_tab)
+ goto unlock;
+ }
+
+ pscreen = util_hash_table_get(fd_tab, intptr_to_pointer(fd));
+ if (pscreen) {
+ virgl_screen(pscreen)->refcnt++;
+ } else {
+ struct virgl_winsys *vws;
+ int dup_fd = dup(fd);
+
+ vws = virgl_drm_winsys_create(dup_fd);
+
+ pscreen = virgl_create_screen(vws);
+ if (pscreen) {
+ util_hash_table_set(fd_tab, intptr_to_pointer(dup_fd), pscreen);
+
+ /* Bit of a hack, to avoid circular linkage dependency,
+ * ie. pipe driver having to call in to winsys, we
+ * override the pipe drivers screen->destroy():
+ */
+ virgl_screen(pscreen)->winsys_priv = pscreen->destroy;
+ pscreen->destroy = virgl_drm_screen_destroy;
+ }
+ }
+
+unlock:
+ pipe_mutex_unlock(virgl_screen_mutex);
+ return pscreen;
+}
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h
index da85ff87d2a..ffd7658ca81 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h
@@ -62,6 +62,7 @@ struct virgl_drm_winsys
pipe_mutex mutex;
struct util_hash_table *bo_handles;
+ struct util_hash_table *bo_names;
pipe_mutex bo_handles_mutex;
};
diff --git a/src/gallium/winsys/virgl/vtest/Android.mk b/src/gallium/winsys/virgl/vtest/Android.mk
new file mode 100644
index 00000000000..3e084e44ceb
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/Android.mk
@@ -0,0 +1,33 @@
+# Copyright (C) 2014 Emil Velikov
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_MODULE := libmesa_winsys_virgl_vtest
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/glx/dri2_glx.c b/src/glx/dri2_glx.c
index 651915aed71..77103492a4f 100644
--- a/src/glx/dri2_glx.c
+++ b/src/glx/dri2_glx.c
@@ -1102,9 +1102,14 @@ dri2BindExtensions(struct dri2_screen *psc, struct glx_display * priv,
__glXEnableDirectExtension(&psc->base, "GLX_ARB_create_context");
__glXEnableDirectExtension(&psc->base, "GLX_ARB_create_context_profile");
- if ((mask & (1 << __DRI_API_GLES2)) != 0)
- __glXEnableDirectExtension(&psc->base,
- "GLX_EXT_create_context_es2_profile");
+ if ((mask & ((1 << __DRI_API_GLES) |
+ (1 << __DRI_API_GLES2) |
+ (1 << __DRI_API_GLES3))) != 0) {
+ __glXEnableDirectExtension(&psc->base,
+ "GLX_EXT_create_context_es_profile");
+ __glXEnableDirectExtension(&psc->base,
+ "GLX_EXT_create_context_es2_profile");
+ }
}
for (i = 0; extensions[i]; i++) {
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index 8bdbb9caf56..6054ffc3dc1 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -665,9 +665,14 @@ dri3_bind_extensions(struct dri3_screen *psc, struct glx_display * priv,
__glXEnableDirectExtension(&psc->base, "GLX_ARB_create_context");
__glXEnableDirectExtension(&psc->base, "GLX_ARB_create_context_profile");
- if ((mask & (1 << __DRI_API_GLES2)) != 0)
+ if ((mask & ((1 << __DRI_API_GLES) |
+ (1 << __DRI_API_GLES2) |
+ (1 << __DRI_API_GLES3))) != 0) {
+ __glXEnableDirectExtension(&psc->base,
+ "GLX_EXT_create_context_es_profile");
__glXEnableDirectExtension(&psc->base,
"GLX_EXT_create_context_es2_profile");
+ }
for (i = 0; extensions[i]; i++) {
/* when on a different gpu than the server, the server pixmaps
diff --git a/src/glx/dri_common.c b/src/glx/dri_common.c
index 8a56385c4bd..6728d38fa0a 100644
--- a/src/glx/dri_common.c
+++ b/src/glx/dri_common.c
@@ -547,9 +547,18 @@ dri2_convert_glx_attribs(unsigned num_attribs, const uint32_t *attribs,
case GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB:
*api = __DRI_API_OPENGL;
break;
- case GLX_CONTEXT_ES2_PROFILE_BIT_EXT:
- *api = __DRI_API_GLES2;
- break;
+ case GLX_CONTEXT_ES_PROFILE_BIT_EXT:
+ if (*major_ver >= 3)
+ *api = __DRI_API_GLES3;
+ else if (*major_ver == 2 && *minor_ver == 0)
+ *api = __DRI_API_GLES2;
+ else if (*major_ver == 1 && *minor_ver < 2)
+ *api = __DRI_API_GLES;
+ else {
+ *error = __DRI_CTX_ERROR_BAD_API;
+ return false;
+ }
+ break;
default:
*error = __DRI_CTX_ERROR_BAD_API;
return false;
@@ -580,19 +589,6 @@ dri2_convert_glx_attribs(unsigned num_attribs, const uint32_t *attribs,
return false;
}
- /* The GLX_EXT_create_context_es2_profile spec says:
- *
- * "... If the version requested is 2.0, and the
- * GLX_CONTEXT_ES2_PROFILE_BIT_EXT bit is set in the
- * GLX_CONTEXT_PROFILE_MASK_ARB attribute (see below), then the context
- * returned will implement OpenGL ES 2.0. This is the only way in which
- * an implementation may request an OpenGL ES 2.0 context."
- */
- if (*api == __DRI_API_GLES2 && (*major_ver != 2 || *minor_ver != 0)) {
- *error = __DRI_CTX_ERROR_BAD_API;
- return false;
- }
-
*error = __DRI_CTX_ERROR_SUCCESS;
return true;
}
diff --git a/src/glx/drisw_glx.c b/src/glx/drisw_glx.c
index 76cc3214b7b..241ac7f6d2c 100644
--- a/src/glx/drisw_glx.c
+++ b/src/glx/drisw_glx.c
@@ -623,8 +623,10 @@ driswBindExtensions(struct drisw_screen *psc, const __DRIextension **extensions)
__glXEnableDirectExtension(&psc->base, "GLX_ARB_create_context");
__glXEnableDirectExtension(&psc->base, "GLX_ARB_create_context_profile");
- /* DRISW version >= 2 implies support for OpenGL ES 2.0.
+ /* DRISW version >= 2 implies support for OpenGL ES.
*/
+ __glXEnableDirectExtension(&psc->base,
+ "GLX_EXT_create_context_es_profile");
__glXEnableDirectExtension(&psc->base,
"GLX_EXT_create_context_es2_profile");
}
diff --git a/src/glx/glxextensions.c b/src/glx/glxextensions.c
index 3b29aef1234..22b078ce484 100644
--- a/src/glx/glxextensions.c
+++ b/src/glx/glxextensions.c
@@ -146,6 +146,7 @@ static const struct extension_info known_glx_extensions[] = {
{ GLX(EXT_fbconfig_packed_float), VER(0,0), Y, Y, N, N },
{ GLX(EXT_framebuffer_sRGB), VER(0,0), Y, Y, N, N },
{ GLX(EXT_create_context_es2_profile), VER(0,0), Y, N, N, N },
+ { GLX(EXT_create_context_es_profile), VER(0,0), Y, N, N, N },
{ GLX(MESA_copy_sub_buffer), VER(0,0), Y, N, N, N },
{ GLX(MESA_multithread_makecurrent),VER(0,0), Y, N, Y, N },
{ GLX(MESA_query_renderer), VER(0,0), Y, N, N, Y },
diff --git a/src/glx/glxextensions.h b/src/glx/glxextensions.h
index 3a9bc823052..906b3fc16c0 100644
--- a/src/glx/glxextensions.h
+++ b/src/glx/glxextensions.h
@@ -45,6 +45,7 @@ enum
EXT_import_context_bit,
EXT_framebuffer_sRGB_bit,
EXT_fbconfig_packed_float_bit,
+ EXT_create_context_es_profile_bit,
EXT_create_context_es2_profile_bit,
MESA_copy_sub_buffer_bit,
MESA_depth_float_bit,
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index d7ab3bff4df..db98ac05fd9 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8247,7 +8247,14 @@
-
+
+
+
+
+
+
+
+
@@ -12661,6 +12668,12 @@
+
+
+
+
+
+
@@ -12714,6 +12727,14 @@
+
+
+
+
+
+
+
+
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 7af8becd607..ffe560faa3d 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -377,6 +377,7 @@ VBO_FILES = \
vbo/vbo_exec_eval.c \
vbo/vbo_exec.h \
vbo/vbo.h \
+ vbo/vbo_minmax_index.c \
vbo/vbo_noop.c \
vbo/vbo_noop.h \
vbo/vbo_primitive_restart.c \
@@ -393,6 +394,7 @@ VBO_FILES = \
STATETRACKER_FILES = \
state_tracker/st_atom_array.c \
+ state_tracker/st_atom_atomicbuf.c \
state_tracker/st_atom_blend.c \
state_tracker/st_atom.c \
state_tracker/st_atom_clip.c \
@@ -409,6 +411,7 @@ STATETRACKER_FILES = \
state_tracker/st_atom_shader.c \
state_tracker/st_atom_shader.h \
state_tracker/st_atom_stipple.c \
+ state_tracker/st_atom_storagebuf.c \
state_tracker/st_atom_tess.c \
state_tracker/st_atom_texture.c \
state_tracker/st_atom_viewport.c \
diff --git a/src/mesa/drivers/dri/common/xmlpool/t_options.h b/src/mesa/drivers/dri/common/xmlpool/t_options.h
index 55e926b239e..e5cbc465871 100644
--- a/src/mesa/drivers/dri/common/xmlpool/t_options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/t_options.h
@@ -363,3 +363,8 @@ DRI_CONF_OPT_END
DRI_CONF_OPT_BEGIN_B(thread_submit, def) \
DRI_CONF_DESC(en,gettext("Use an additional thread to submit buffers.")) \
DRI_CONF_OPT_END
+
+#define DRI_CONF_NINE_OVERRIDEVENDOR(def) \
+DRI_CONF_OPT_BEGIN(override_vendorid, int, def) \
+ DRI_CONF_DESC(en,"Define the vendor_id to report. This allows faking another hardware vendor.") \
+DRI_CONF_OPT_END
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index 0401e397031..00e44af2f8d 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -23,7 +23,7 @@
#include "brw_compiler.h"
#include "brw_context.h"
-#include "nir.h"
+#include "compiler/nir/nir.h"
#include "main/errors.h"
#include "util/debug.h"
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 1032e5a8175..44d2fe4d9e4 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -166,6 +166,19 @@ intel_viewport(struct gl_context *ctx)
}
}
+static void
+intel_update_framebuffer(struct gl_context *ctx,
+ struct gl_framebuffer *fb)
+{
+ struct brw_context *brw = brw_context(ctx);
+
+ /* Quantize the derived default number of samples
+ */
+ fb->DefaultGeometry._NumSamples =
+ intel_quantize_num_samples(brw->intelScreen,
+ fb->DefaultGeometry.NumSamples);
+}
+
static void
intel_update_state(struct gl_context * ctx, GLuint new_state)
{
@@ -245,6 +258,12 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
}
_mesa_lock_context_textures(ctx);
+
+ if (new_state & _NEW_BUFFERS) {
+ intel_update_framebuffer(ctx, ctx->DrawBuffer);
+ if (ctx->DrawBuffer != ctx->ReadBuffer)
+ intel_update_framebuffer(ctx, ctx->ReadBuffer);
+ }
}
#define flushFront(screen) ((screen)->image.loader ? (screen)->image.loader->flushFrontBuffer : (screen)->dri2.loader->flushFrontBuffer)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index 994c699bb5a..d7a1456bce0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -268,7 +268,7 @@ fs_visitor::opt_combine_constants()
qsort(table.imm, table.len, sizeof(struct imm), compare);
/* Insert MOVs to load the constant values into GRFs. */
- fs_reg reg(VGRF, alloc.allocate(dispatch_width / 8));
+ fs_reg reg(VGRF, alloc.allocate(1));
reg.stride = 0;
for (int i = 0; i < table.len; i++) {
struct imm *imm = &table.imm[i];
@@ -284,8 +284,8 @@ fs_visitor::opt_combine_constants()
imm->subreg_offset = reg.subreg_offset;
reg.subreg_offset += sizeof(float);
- if ((unsigned)reg.subreg_offset == dispatch_width * sizeof(float)) {
- reg.nr = alloc.allocate(dispatch_width / 8);
+ if ((unsigned)reg.subreg_offset == 8 * sizeof(float)) {
+ reg.nr = alloc.allocate(1);
reg.subreg_offset = 0;
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 6c3a8d70677..cd7f3fe851a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1144,16 +1144,16 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->predicate = BRW_PREDICATE_NORMAL;
break;
- case nir_op_extract_ubyte:
- case nir_op_extract_ibyte: {
+ case nir_op_extract_u8:
+ case nir_op_extract_i8: {
nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
result, op[0], brw_imm_ud(byte->u[0]));
break;
}
- case nir_op_extract_uword:
- case nir_op_extract_iword: {
+ case nir_op_extract_u16:
+ case nir_op_extract_i16: {
nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
bld.emit(SHADER_OPCODE_EXTRACT_WORD,
result, op[0], brw_imm_ud(word->u[0]));
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 6b9bfcf0b85..c1690ad45c3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -939,7 +939,7 @@ fs_visitor::emit_barrier()
/* Clear the message payload */
pbld.MOV(payload, brw_imm_ud(0u));
- /* Copy bits 27:24 of r0.2 (barrier id) to the message payload reg.2 */
+ /* Copy the barrier id from r0.2 to the message payload reg.2 */
fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
pbld.AND(component(payload, 2), r0_2, brw_imm_ud(barrier_id_mask));
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
index c6f0b0d8a2a..6bd992882b8 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
@@ -254,8 +254,8 @@ try_constant_propagate(const struct brw_device_info *devinfo,
static bool
try_copy_propagate(const struct brw_device_info *devinfo,
- vec4_instruction *inst,
- int arg, struct copy_entry *entry)
+ vec4_instruction *inst, int arg,
+ struct copy_entry *entry, int attributes_per_reg)
{
/* Build up the value we are propagating as if it were the source of a
* single MOV
@@ -320,7 +320,8 @@ try_copy_propagate(const struct brw_device_info *devinfo,
unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
value.swizzle);
if (inst->is_3src() &&
- value.file == UNIFORM &&
+ (value.file == UNIFORM ||
+ (value.file == ATTR && attributes_per_reg != 1)) &&
!brw_is_single_value_swizzle(composed_swizzle))
return false;
@@ -395,6 +396,11 @@ try_copy_propagate(const struct brw_device_info *devinfo,
bool
vec4_visitor::opt_copy_propagation(bool do_constant_prop)
{
+ /* If we are in dual instanced or single mode, then attributes are going
+ * to be interleaved, so one register contains two attribute slots.
+ */
+ const int attributes_per_reg =
+ prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
bool progress = false;
struct copy_entry entries[alloc.total_size];
@@ -465,7 +471,7 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop)
if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry))
progress = true;
- if (try_copy_propagate(devinfo, inst, i, &entry))
+ if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
progress = true;
}
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index 904950dfa07..0df25d2557c 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -210,7 +210,7 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
{
const unsigned depth = max_layer - min_layer;
struct intel_mipmap_tree *aux_mt = NULL;
- uint32_t aux_mode = 0;
+ uint32_t aux_mode = GEN8_SURFACE_AUX_MODE_NONE;
uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
int surf_index = surf_offset - &brw->wm.base.surf_offset[0];
unsigned tiling_mode, pitch;
@@ -425,7 +425,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
struct intel_renderbuffer *irb = intel_renderbuffer(rb);
struct intel_mipmap_tree *mt = irb->mt;
struct intel_mipmap_tree *aux_mt = NULL;
- uint32_t aux_mode = 0;
+ uint32_t aux_mode = GEN8_SURFACE_AUX_MODE_NONE;
unsigned width = mt->logical_width0;
unsigned height = mt->logical_height0;
unsigned pitch = mt->pitch;
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 8ede1f06e4e..de1aba44c1b 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -32,6 +32,7 @@
#include
#include /* for PRId64 macro */
+#include "util/debug.h"
#include "glheader.h"
#include "enums.h"
#include "hash.h"
@@ -120,6 +121,10 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
return &ctx->CopyReadBuffer;
case GL_COPY_WRITE_BUFFER:
return &ctx->CopyWriteBuffer;
+ case GL_QUERY_BUFFER:
+ if (_mesa_has_ARB_query_buffer_object(ctx))
+ return &ctx->QueryBuffer;
+ break;
case GL_DRAW_INDIRECT_BUFFER:
if ((ctx->API == API_OPENGL_CORE &&
ctx->Extensions.ARB_draw_indirect) ||
@@ -458,6 +463,7 @@ _mesa_delete_buffer_object(struct gl_context *ctx,
{
(void) ctx;
+ vbo_delete_minmax_cache(bufObj);
_mesa_align_free(bufObj->Data);
/* assign strange values here to help w/ debugging */
@@ -519,6 +525,24 @@ _mesa_reference_buffer_object_(struct gl_context *ctx,
}
+/**
+ * Get the value of MESA_NO_MINMAX_CACHE.
+ */
+static bool
+get_no_minmax_cache()
+{
+ static bool read = false;
+ static bool disable = false;
+
+ if (!read) {
+ disable = env_var_as_boolean("MESA_NO_MINMAX_CACHE", false);
+ read = true;
+ }
+
+ return disable;
+}
+
+
/**
* Initialize a buffer object to default values.
*/
@@ -532,6 +556,9 @@ _mesa_initialize_buffer_object(struct gl_context *ctx,
obj->RefCount = 1;
obj->Name = name;
obj->Usage = GL_STATIC_DRAW_ARB;
+
+ if (get_no_minmax_cache())
+ obj->UsageHistory |= USAGE_DISABLE_MINMAX_CACHE;
}
@@ -877,6 +904,9 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
_mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer,
ctx->Shared->NullBufferObj);
+ _mesa_reference_buffer_object(ctx, &ctx->QueryBuffer,
+ ctx->Shared->NullBufferObj);
+
for (i = 0; i < MAX_COMBINED_UNIFORM_BUFFERS; i++) {
_mesa_reference_buffer_object(ctx,
&ctx->UniformBufferBindings[i].BufferObject,
@@ -925,6 +955,8 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
_mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer, NULL);
+ _mesa_reference_buffer_object(ctx, &ctx->QueryBuffer, NULL);
+
for (i = 0; i < MAX_COMBINED_UNIFORM_BUFFERS; i++) {
_mesa_reference_buffer_object(ctx,
&ctx->UniformBufferBindings[i].BufferObject,
@@ -1014,6 +1046,15 @@ bind_buffer_object(struct gl_context *ctx, GLenum target, GLuint buffer)
return;
}
+ /* record usage history */
+ switch (target) {
+ case GL_PIXEL_PACK_BUFFER:
+ newBufObj->UsageHistory |= USAGE_PIXEL_PACK_BUFFER;
+ break;
+ default:
+ break;
+ }
+
/* bind new buffer */
_mesa_reference_buffer_object(ctx, bindTarget, newBufObj);
}
@@ -1348,6 +1389,11 @@ _mesa_DeleteBuffers(GLsizei n, const GLuint *ids)
_mesa_BindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
}
+ /* unbind query buffer binding point */
+ if (ctx->QueryBuffer == bufObj) {
+ _mesa_BindBuffer(GL_QUERY_BUFFER, 0);
+ }
+
/* The ID is immediately freed for re-use */
_mesa_HashRemove(ctx->Shared->BufferObjects, ids[i]);
/* Make sure we do not run into the classic ABA problem on bind.
@@ -1519,6 +1565,7 @@ _mesa_buffer_storage(struct gl_context *ctx, struct gl_buffer_object *bufObj,
bufObj->Written = GL_TRUE;
bufObj->Immutable = GL_TRUE;
+ bufObj->MinMaxCacheDirty = true;
assert(ctx->Driver.BufferData);
if (!ctx->Driver.BufferData(ctx, target, size, data, GL_DYNAMIC_DRAW,
@@ -1632,6 +1679,7 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
FLUSH_VERTICES(ctx, _NEW_BUFFER_OBJECT);
bufObj->Written = GL_TRUE;
+ bufObj->MinMaxCacheDirty = true;
#ifdef VBO_DEBUG
printf("glBufferDataARB(%u, sz %ld, from %p, usage 0x%x)\n",
@@ -1744,6 +1792,7 @@ _mesa_buffer_sub_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
}
bufObj->Written = GL_TRUE;
+ bufObj->MinMaxCacheDirty = true;
assert(ctx->Driver.BufferSubData);
ctx->Driver.BufferSubData(ctx, offset, size, data, bufObj);
@@ -1859,12 +1908,16 @@ _mesa_clear_buffer_sub_data(struct gl_context *ctx,
return;
}
+ /* Bail early. Negative size has already been checked. */
+ if (size == 0)
+ return;
+
+ bufObj->MinMaxCacheDirty = true;
+
if (data == NULL) {
/* clear to zeros, per the spec */
- if (size > 0) {
- ctx->Driver.ClearBufferSubData(ctx, offset, size,
- NULL, clearValueSize, bufObj);
- }
+ ctx->Driver.ClearBufferSubData(ctx, offset, size,
+ NULL, clearValueSize, bufObj);
return;
}
@@ -1873,10 +1926,8 @@ _mesa_clear_buffer_sub_data(struct gl_context *ctx,
return;
}
- if (size > 0) {
- ctx->Driver.ClearBufferSubData(ctx, offset, size,
- clearValue, clearValueSize, bufObj);
- }
+ ctx->Driver.ClearBufferSubData(ctx, offset, size,
+ clearValue, clearValueSize, bufObj);
}
void GLAPIENTRY
@@ -2276,6 +2327,8 @@ _mesa_copy_buffer_sub_data(struct gl_context *ctx,
}
}
+ dst->MinMaxCacheDirty = true;
+
ctx->Driver.CopyBufferSubData(ctx, src, dst, readOffset, writeOffset, size);
}
@@ -2480,8 +2533,10 @@ _mesa_map_buffer_range(struct gl_context *ctx,
assert(bufObj->Mappings[MAP_USER].AccessFlags == access);
}
- if (access & GL_MAP_WRITE_BIT)
+ if (access & GL_MAP_WRITE_BIT) {
bufObj->Written = GL_TRUE;
+ bufObj->MinMaxCacheDirty = true;
+ }
#ifdef VBO_DEBUG
if (strstr(func, "Range") == NULL) { /* If not MapRange */
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index d4378e51159..19ef3042548 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -48,6 +48,7 @@ struct gl_shader;
struct gl_shader_program;
struct gl_texture_image;
struct gl_texture_object;
+struct gl_memory_info;
/* GL_ARB_vertex_buffer_object */
/* Modifies GL_MAP_UNSYNCHRONIZED_BIT to allow driver to fail (return
@@ -726,6 +727,15 @@ struct dd_function_table {
void (*EndQuery)(struct gl_context *ctx, struct gl_query_object *q);
void (*CheckQuery)(struct gl_context *ctx, struct gl_query_object *q);
void (*WaitQuery)(struct gl_context *ctx, struct gl_query_object *q);
+ /*
+ * \pname the value requested to be written (GL_QUERY_RESULT, etc)
+ * \ptype the type of the value requested to be written:
+ * GL_UNSIGNED_INT, GL_UNSIGNED_INT64_ARB,
+ * GL_INT, GL_INT64_ARB
+ */
+ void (*StoreQueryResult)(struct gl_context *ctx, struct gl_query_object *q,
+ struct gl_buffer_object *buf, intptr_t offset,
+ GLenum pname, GLenum ptype);
/*@}*/
/**
@@ -939,6 +949,13 @@ struct dd_function_table {
void (*DispatchCompute)(struct gl_context *ctx, const GLuint *num_groups);
void (*DispatchComputeIndirect)(struct gl_context *ctx, GLintptr indirect);
/*@}*/
+
+ /**
+ * Query information about memory. Device memory is e.g. VRAM. Staging
+ * memory is e.g. GART. All sizes are in kilobytes.
+ */
+ void (*QueryMemoryInfo)(struct gl_context *ctx,
+ struct gl_memory_info *info);
};
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 11f4482f8d2..ded6f2c06dc 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -88,6 +88,7 @@ EXT(ARB_point_parameters , EXT_point_parameters
EXT(ARB_point_sprite , ARB_point_sprite , GLL, GLC, x , x , 2003)
EXT(ARB_program_interface_query , dummy_true , GLL, GLC, x , x , 2012)
EXT(ARB_provoking_vertex , EXT_provoking_vertex , GLL, GLC, x , x , 2009)
+EXT(ARB_query_buffer_object , ARB_query_buffer_object , GLL, GLC, x , x , 2013)
EXT(ARB_robustness , dummy_true , GLL, GLC, x , x , 2010)
EXT(ARB_sample_shading , ARB_sample_shading , GLL, GLC, x , x , 2009)
EXT(ARB_sampler_objects , dummy_true , GLL, GLC, x , x , 2009)
@@ -165,6 +166,7 @@ EXT(ARB_window_pos , dummy_true
EXT(ATI_blend_equation_separate , EXT_blend_equation_separate , GLL, GLC, x , x , 2003)
EXT(ATI_draw_buffers , dummy_true , GLL, x , x , x , 2002)
EXT(ATI_fragment_shader , ATI_fragment_shader , GLL, x , x , x , 2001)
+EXT(ATI_meminfo , ATI_meminfo , GLL, GLC, x , x , 2009)
EXT(ATI_separate_stencil , ATI_separate_stencil , GLL, x , x , x , 2006)
EXT(ATI_texture_compression_3dc , ATI_texture_compression_3dc , GLL, x , x , x , 2004)
EXT(ATI_texture_env_combine3 , ATI_texture_env_combine3 , GLL, x , x , x , 2002)
@@ -291,6 +293,7 @@ EXT(NV_texture_barrier , NV_texture_barrier
EXT(NV_texture_env_combine4 , NV_texture_env_combine4 , GLL, x , x , x , 1999)
EXT(NV_texture_rectangle , NV_texture_rectangle , GLL, x , x , x , 2000)
EXT(NV_vdpau_interop , NV_vdpau_interop , GLL, GLC, x , x , 2010)
+EXT(NVX_gpu_memory_info , NVX_gpu_memory_info , GLL, GLC, x , x , 2013)
EXT(OES_EGL_image , OES_EGL_image , GLL, GLC, ES1, ES2, 2006) /* FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */
EXT(OES_EGL_image_external , OES_EGL_image_external , x , x , ES1, ES2, 2010)
@@ -311,6 +314,7 @@ EXT(OES_element_index_uint , dummy_true
EXT(OES_fbo_render_mipmap , dummy_true , x , x , ES1, ES2, 2005)
EXT(OES_fixed_point , dummy_true , x , x , ES1, x , 2002)
EXT(OES_framebuffer_object , dummy_true , x , x , ES1, x , 2005)
+EXT(OES_geometry_point_size , OES_geometry_shader , x , x , x , 31, 2015)
EXT(OES_geometry_shader , OES_geometry_shader , x , x , x , 31, 2015)
EXT(OES_get_program_binary , dummy_true , x , x , x , ES2, 2008)
EXT(OES_mapbuffer , dummy_true , x , x , ES1, ES2, 2005)
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 3be216da234..2d4acb35bd6 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1414,6 +1414,9 @@ framebuffer_parameteri(struct gl_context *ctx, struct gl_framebuffer *fb,
_mesa_error(ctx, GL_INVALID_ENUM,
"%s(pname=0x%x)", func, pname);
}
+
+ invalidate_framebuffer(fb);
+ ctx->NewState |= _NEW_BUFFERS;
}
void GLAPIENTRY
diff --git a/src/mesa/main/format_parser.py b/src/mesa/main/format_parser.py
index 799b14f0b1c..a29f20754a8 100755
--- a/src/mesa/main/format_parser.py
+++ b/src/mesa/main/format_parser.py
@@ -532,7 +532,7 @@ def _parse_channels(fields, layout, colorspace, swizzle):
return channels
def parse(filename):
- """Parse a format descrition in CSV format.
+ """Parse a format description in CSV format.
This function parses the given CSV file and returns an iterable of
channels."""
diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h
index bfc8a0836e7..fa434d447ae 100644
--- a/src/mesa/main/framebuffer.h
+++ b/src/mesa/main/framebuffer.h
@@ -26,7 +26,7 @@
#ifndef FRAMEBUFFER_H
#define FRAMEBUFFER_H
-#include "glheader.h"
+#include "mtypes.h"
struct gl_config;
struct gl_context;
@@ -97,7 +97,8 @@ static inline GLuint
_mesa_geometric_samples(const struct gl_framebuffer *buffer)
{
return buffer->_HasAttachments ?
- buffer->Visual.samples : buffer->DefaultGeometry.NumSamples;
+ buffer->Visual.samples :
+ buffer->DefaultGeometry._NumSamples;
}
static inline GLuint
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 95cb18c8ee8..8453a922549 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -147,11 +147,14 @@ enum value_extra {
EXTRA_VALID_CLIP_DISTANCE,
EXTRA_FLUSH_CURRENT,
EXTRA_GLSL_130,
- EXTRA_EXT_UBO_GS4,
- EXTRA_EXT_ATOMICS_GS4,
- EXTRA_EXT_SHADER_IMAGE_GS4,
+ EXTRA_EXT_UBO_GS,
+ EXTRA_EXT_ATOMICS_GS,
+ EXTRA_EXT_SHADER_IMAGE_GS,
EXTRA_EXT_ATOMICS_TESS,
EXTRA_EXT_SHADER_IMAGE_TESS,
+ EXTRA_EXT_SSBO_GS,
+ EXTRA_EXT_FB_NO_ATTACH_GS,
+ EXTRA_EXT_ES_GS,
};
#define NO_EXTRA NULL
@@ -308,7 +311,7 @@ static const int extra_ARB_transform_feedback2_api_es3[] = {
};
static const int extra_ARB_uniform_buffer_object_and_geometry_shader[] = {
- EXTRA_EXT_UBO_GS4,
+ EXTRA_EXT_UBO_GS,
EXTRA_END
};
@@ -343,12 +346,12 @@ static const int extra_EXT_texture_array_es3[] = {
};
static const int extra_ARB_shader_atomic_counters_and_geometry_shader[] = {
- EXTRA_EXT_ATOMICS_GS4,
+ EXTRA_EXT_ATOMICS_GS,
EXTRA_END
};
static const int extra_ARB_shader_image_load_store_and_geometry_shader[] = {
- EXTRA_EXT_SHADER_IMAGE_GS4,
+ EXTRA_EXT_SHADER_IMAGE_GS,
EXTRA_END
};
@@ -375,6 +378,28 @@ static const int extra_ARB_shader_storage_buffer_object_es31[] = {
EXTRA_END
};
+static const int extra_ARB_shader_storage_buffer_object_and_geometry_shader[] = {
+ EXTRA_EXT_SSBO_GS,
+ EXTRA_END
+};
+
+static const int extra_ARB_framebuffer_no_attachments_and_geometry_shader[] = {
+ EXTRA_EXT_FB_NO_ATTACH_GS,
+ EXTRA_END
+};
+
+static const int extra_ARB_viewport_array_or_oes_geometry_shader[] = {
+ EXT(ARB_viewport_array),
+ EXTRA_EXT_ES_GS,
+ EXTRA_END
+};
+
+static const int extra_ARB_gpu_shader5_or_oes_geometry_shader[] = {
+ EXT(ARB_gpu_shader5),
+ EXTRA_EXT_ES_GS,
+ EXTRA_END
+};
+
EXTRA_EXT(ARB_texture_cube_map);
EXTRA_EXT(EXT_texture_array);
EXTRA_EXT(NV_fog_distance);
@@ -414,6 +439,7 @@ EXTRA_EXT(ARB_shader_image_load_store);
EXTRA_EXT(ARB_viewport_array);
EXTRA_EXT(ARB_compute_shader);
EXTRA_EXT(ARB_gpu_shader5);
+EXTRA_EXT(ARB_query_buffer_object);
EXTRA_EXT2(ARB_transform_feedback3, ARB_gpu_shader5);
EXTRA_EXT(INTEL_performance_query);
EXTRA_EXT(ARB_explicit_uniform_location);
@@ -424,6 +450,8 @@ EXTRA_EXT(ARB_tessellation_shader);
EXTRA_EXT(ARB_shader_subroutine);
EXTRA_EXT(ARB_shader_storage_buffer_object);
EXTRA_EXT(ARB_indirect_parameters);
+EXTRA_EXT(ATI_meminfo);
+EXTRA_EXT(NVX_gpu_memory_info);
static const int
extra_ARB_color_buffer_float_or_glcore[] = {
@@ -455,6 +483,12 @@ static const int extra_gl32_es3[] = {
EXTRA_END,
};
+static const int extra_version_32_OES_geometry_shader[] = {
+ EXTRA_VERSION_32,
+ EXTRA_EXT_ES_GS,
+ EXTRA_END
+};
+
static const int extra_gl40_ARB_sample_shading[] = {
EXTRA_VERSION_40,
EXT(ARB_sample_shading),
@@ -1006,6 +1040,10 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
case GL_SHADER_STORAGE_BUFFER_BINDING:
v->value_int = ctx->ShaderStorageBuffer->Name;
break;
+ /* GL_ARB_query_buffer_object */
+ case GL_QUERY_BUFFER_BINDING:
+ v->value_int = ctx->QueryBuffer->Name;
+ break;
/* GL_ARB_timer_query */
case GL_TIMESTAMP:
if (ctx->Driver.GetTimestamp) {
@@ -1049,6 +1087,60 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
case GL_DISPATCH_INDIRECT_BUFFER_BINDING:
v->value_int = ctx->DispatchIndirectBuffer->Name;
break;
+ /* GL_ARB_multisample */
+ case GL_SAMPLES:
+ v->value_int = _mesa_geometric_samples(ctx->DrawBuffer);
+ break;
+ case GL_SAMPLE_BUFFERS:
+ v->value_int = _mesa_geometric_samples(ctx->DrawBuffer) > 0;
+ break;
+ /* GL_ATI_meminfo & GL_NVX_gpu_memory_info */
+ case GL_VBO_FREE_MEMORY_ATI:
+ case GL_TEXTURE_FREE_MEMORY_ATI:
+ case GL_RENDERBUFFER_FREE_MEMORY_ATI:
+ case GL_GPU_MEMORY_INFO_DEDICATED_VIDMEM_NVX:
+ case GL_GPU_MEMORY_INFO_TOTAL_AVAILABLE_MEMORY_NVX:
+ case GL_GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX:
+ case GL_GPU_MEMORY_INFO_EVICTION_COUNT_NVX:
+ case GL_GPU_MEMORY_INFO_EVICTED_MEMORY_NVX:
+ {
+ struct gl_memory_info info;
+
+ ctx->Driver.QueryMemoryInfo(ctx, &info);
+
+ if (d->pname == GL_GPU_MEMORY_INFO_DEDICATED_VIDMEM_NVX)
+ v->value_int = info.total_device_memory;
+ else if (d->pname == GL_GPU_MEMORY_INFO_TOTAL_AVAILABLE_MEMORY_NVX)
+ v->value_int = info.total_device_memory +
+ info.total_staging_memory;
+ else if (d->pname == GL_GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX)
+ v->value_int = info.avail_device_memory;
+ else if (d->pname == GL_GPU_MEMORY_INFO_EVICTION_COUNT_NVX)
+ v->value_int = info.nr_device_memory_evictions;
+ else if (d->pname == GL_GPU_MEMORY_INFO_EVICTED_MEMORY_NVX)
+ v->value_int = info.device_memory_evicted;
+ else {
+ /* ATI free memory enums.
+ *
+ * Since the GPU memory is (usually) page-table based, every two
+ * consecutive elements are equal. From the GL_ATI_meminfo
+ * specification:
+ *
+ * "param[0] - total memory free in the pool
+ * param[1] - largest available free block in the pool
+ * param[2] - total auxiliary memory free
+ * param[3] - largest auxiliary free block"
+ *
+ * All three (VBO, TEXTURE, RENDERBUFFER) queries return
+ * the same numbers here.
+ */
+ v->value_int_4[0] = info.avail_device_memory;
+ v->value_int_4[1] = info.avail_device_memory;
+ v->value_int_4[2] = info.avail_staging_memory;
+ v->value_int_4[3] = info.avail_staging_memory;
+ }
+ }
+ break;
}
}
@@ -1154,20 +1246,23 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
if (ctx->Const.GLSLVersion >= 130)
api_found = GL_TRUE;
break;
- case EXTRA_EXT_UBO_GS4:
+ case EXTRA_EXT_UBO_GS:
api_check = GL_TRUE;
- api_found = (ctx->Extensions.ARB_uniform_buffer_object &&
- _mesa_has_geometry_shaders(ctx));
+ if (ctx->Extensions.ARB_uniform_buffer_object &&
+ _mesa_has_geometry_shaders(ctx))
+ api_found = GL_TRUE;
break;
- case EXTRA_EXT_ATOMICS_GS4:
+ case EXTRA_EXT_ATOMICS_GS:
api_check = GL_TRUE;
- api_found = (ctx->Extensions.ARB_shader_atomic_counters &&
- _mesa_has_geometry_shaders(ctx));
+ if (ctx->Extensions.ARB_shader_atomic_counters &&
+ _mesa_has_geometry_shaders(ctx))
+ api_found = GL_TRUE;
break;
- case EXTRA_EXT_SHADER_IMAGE_GS4:
+ case EXTRA_EXT_SHADER_IMAGE_GS:
api_check = GL_TRUE;
- api_found = (ctx->Extensions.ARB_shader_image_load_store &&
- _mesa_has_geometry_shaders(ctx));
+ if (ctx->Extensions.ARB_shader_image_load_store &&
+ _mesa_has_geometry_shaders(ctx))
+ api_found = GL_TRUE;
break;
case EXTRA_EXT_ATOMICS_TESS:
api_check = GL_TRUE;
@@ -1179,6 +1274,24 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
api_found = ctx->Extensions.ARB_shader_image_load_store &&
_mesa_has_tessellation(ctx);
break;
+ case EXTRA_EXT_SSBO_GS:
+ api_check = GL_TRUE;
+ if (ctx->Extensions.ARB_shader_storage_buffer_object &&
+ _mesa_has_geometry_shaders(ctx))
+ api_found = GL_TRUE;
+ break;
+ case EXTRA_EXT_FB_NO_ATTACH_GS:
+ api_check = GL_TRUE;
+ if (ctx->Extensions.ARB_framebuffer_no_attachments &&
+ (_mesa_is_desktop_gl(ctx) ||
+ _mesa_has_OES_geometry_shader(ctx)))
+ api_found = GL_TRUE;
+ break;
+ case EXTRA_EXT_ES_GS:
+ api_check = GL_TRUE;
+ if (_mesa_has_OES_geometry_shader(ctx))
+ api_found = GL_TRUE;
+ break;
case EXTRA_END:
break;
default: /* *e is a offset into the extension struct */
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index af7a8f4a906..164095c103c 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -80,8 +80,8 @@ descriptor=[
[ "SAMPLE_COVERAGE_ARB", "CONTEXT_BOOL(Multisample.SampleCoverage), NO_EXTRA" ],
[ "SAMPLE_COVERAGE_VALUE_ARB", "CONTEXT_FLOAT(Multisample.SampleCoverageValue), NO_EXTRA" ],
[ "SAMPLE_COVERAGE_INVERT_ARB", "CONTEXT_BOOL(Multisample.SampleCoverageInvert), NO_EXTRA" ],
- [ "SAMPLE_BUFFERS_ARB", "BUFFER_INT(Visual.sampleBuffers), extra_new_buffers" ],
- [ "SAMPLES_ARB", "BUFFER_INT(Visual.samples), extra_new_buffers" ],
+ [ "SAMPLE_BUFFERS_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_new_buffers" ],
+ [ "SAMPLES_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_new_buffers" ],
# GL_ARB_sample_shading
[ "SAMPLE_SHADING_ARB", "CONTEXT_BOOL(Multisample.SampleShading), extra_gl40_ARB_sample_shading" ],
@@ -470,6 +470,9 @@ descriptor=[
["MAX_FRAMEBUFFER_HEIGHT", "CONTEXT_INT(Const.MaxFramebufferHeight), extra_ARB_framebuffer_no_attachments"],
["MAX_FRAMEBUFFER_SAMPLES", "CONTEXT_INT(Const.MaxFramebufferSamples), extra_ARB_framebuffer_no_attachments"],
+# GL_ARB_framebuffer_no_attachments / geometry shader
+ [ "MAX_FRAMEBUFFER_LAYERS", "CONTEXT_INT(Const.MaxFramebufferLayers), extra_ARB_framebuffer_no_attachments_and_geometry_shader" ],
+
# GL_ARB_explicit_uniform_location / GLES 3.1
[ "MAX_UNIFORM_LOCATIONS", "CONTEXT_INT(Const.MaxUserAssignableUniformLocations), extra_ARB_explicit_uniform_location" ],
@@ -499,6 +502,34 @@ descriptor=[
{ "apis": ["GL_CORE", "GLES31"], "params": [
# GL_ARB_draw_indirect / GLES 3.1
[ "DRAW_INDIRECT_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_draw_indirect" ],
+
+# GL 3.2 / GL OES_geometry_shader
+ [ "MAX_GEOMETRY_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxInputComponents), extra_version_32_OES_geometry_shader" ],
+ [ "MAX_GEOMETRY_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxOutputComponents), extra_version_32_OES_geometry_shader" ],
+ [ "MAX_GEOMETRY_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits), extra_version_32_OES_geometry_shader" ],
+ [ "MAX_GEOMETRY_OUTPUT_VERTICES", "CONTEXT_INT(Const.MaxGeometryOutputVertices), extra_version_32_OES_geometry_shader" ],
+ [ "MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.MaxGeometryTotalOutputComponents), extra_version_32_OES_geometry_shader" ],
+ [ "MAX_GEOMETRY_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxUniformComponents), extra_version_32_OES_geometry_shader" ],
+
+# GL_ARB_shader_image_load_store / geometry shader
+ [ "MAX_GEOMETRY_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms), extra_ARB_shader_image_load_store_and_geometry_shader" ],
+
+# GL_ARB_shader_atomic_counters / geometry shader
+ [ "MAX_GEOMETRY_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_geometry_shader " ],
+ [ "MAX_GEOMETRY_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_geometry_shader" ],
+
+# GL_ARB_shader_storage_buffer_object / geometry shader
+ [ "MAX_GEOMETRY_SHADER_STORAGE_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxShaderStorageBlocks), extra_ARB_shader_storage_buffer_object_and_geometry_shader" ],
+
+# GL_ARB_uniform_buffer_object / geometry shader
+ [ "MAX_GEOMETRY_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxUniformBlocks), extra_ARB_uniform_buffer_object_and_geometry_shader" ],
+ [ "MAX_COMBINED_GEOMETRY_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxCombinedUniformComponents), extra_ARB_uniform_buffer_object_and_geometry_shader" ],
+
+# GL_ARB_viewport_array / GL_OES_geometry_shader
+ [ "LAYER_PROVOKING_VERTEX", "CONTEXT_ENUM(Light.ProvokingVertex), extra_ARB_viewport_array_or_oes_geometry_shader" ],
+
+# GL_ARB_gpu_shader5 / GL_OES_geometry_shader
+ [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5_or_oes_geometry_shader" ],
]},
# Remaining enums are only in OpenGL
@@ -790,21 +821,10 @@ descriptor=[
# GL 3.2
[ "CONTEXT_PROFILE_MASK", "CONTEXT_INT(Const.ProfileMask), extra_version_32" ],
- [ "MAX_GEOMETRY_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxInputComponents), extra_version_32" ],
- [ "MAX_GEOMETRY_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxOutputComponents), extra_version_32" ],
- [ "MAX_GEOMETRY_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits), extra_version_32" ],
- [ "MAX_GEOMETRY_OUTPUT_VERTICES", "CONTEXT_INT(Const.MaxGeometryOutputVertices), extra_version_32" ],
- [ "MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.MaxGeometryTotalOutputComponents), extra_version_32" ],
- [ "MAX_GEOMETRY_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxUniformComponents), extra_version_32" ],
# GL_ARB_robustness
[ "RESET_NOTIFICATION_STRATEGY_ARB", "CONTEXT_ENUM(Const.ResetStrategy), NO_EXTRA" ],
-
-# GL_ARB_uniform_buffer_object
- [ "MAX_GEOMETRY_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxUniformBlocks), extra_ARB_uniform_buffer_object_and_geometry_shader" ],
- [ "MAX_COMBINED_GEOMETRY_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxCombinedUniformComponents), extra_ARB_uniform_buffer_object_and_geometry_shader" ],
-
# GL_ARB_timer_query
[ "TIMESTAMP", "LOC_CUSTOM, TYPE_INT64, 0, extra_ARB_timer_query" ],
@@ -817,25 +837,31 @@ descriptor=[
# GL_ARB_texture_gather
[ "MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB", "CONTEXT_INT(Const.MaxProgramTextureGatherComponents), extra_ARB_texture_gather"],
-# GL_ARB_shader_atomic_counters
- [ "MAX_GEOMETRY_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_geometry_shader" ],
- [ "MAX_GEOMETRY_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_geometry_shader" ],
-
# GL_ARB_shader_image_load_store
[ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store" ],
[ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ],
- [ "MAX_GEOMETRY_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms), extra_ARB_shader_image_load_store_and_geometry_shader"],
-
-# GL_ARB_framebuffer_no_attachments
- ["MAX_FRAMEBUFFER_LAYERS", "CONTEXT_INT(Const.MaxFramebufferLayers), extra_ARB_framebuffer_no_attachments"],
# GL_EXT_polygon_offset_clamp
[ "POLYGON_OFFSET_CLAMP_EXT", "CONTEXT_FLOAT(Polygon.OffsetClamp), extra_EXT_polygon_offset_clamp" ],
# GL_ARB_shader_storage_buffer_object
- [ "MAX_GEOMETRY_SHADER_STORAGE_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxShaderStorageBlocks), extra_ARB_shader_storage_buffer_object" ],
[ "MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxShaderStorageBlocks), extra_ARB_shader_storage_buffer_object" ],
[ "MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxShaderStorageBlocks), extra_ARB_shader_storage_buffer_object" ],
+
+# GL_ARB_query_buffer_object
+ [ "QUERY_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_query_buffer_object" ],
+
+# GL_ATI_meminfo
+ [ "VBO_FREE_MEMORY_ATI", "LOC_CUSTOM, TYPE_INT_4, NO_OFFSET, extra_ATI_meminfo" ],
+ [ "TEXTURE_FREE_MEMORY_ATI", "LOC_CUSTOM, TYPE_INT_4, NO_OFFSET, extra_ATI_meminfo" ],
+ [ "RENDERBUFFER_FREE_MEMORY_ATI", "LOC_CUSTOM, TYPE_INT_4, NO_OFFSET, extra_ATI_meminfo" ],
+
+# GL_NVX_gpu_memory_info
+ [ "GPU_MEMORY_INFO_DEDICATED_VIDMEM_NVX", "LOC_CUSTOM, TYPE_INT, NO_OFFSET, extra_NVX_gpu_memory_info" ],
+ [ "GPU_MEMORY_INFO_TOTAL_AVAILABLE_MEMORY_NVX", "LOC_CUSTOM, TYPE_INT, NO_OFFSET, extra_NVX_gpu_memory_info" ],
+ [ "GPU_MEMORY_INFO_CURRENT_AVAILABLE_VIDMEM_NVX", "LOC_CUSTOM, TYPE_INT, NO_OFFSET, extra_NVX_gpu_memory_info" ],
+ [ "GPU_MEMORY_INFO_EVICTION_COUNT_NVX", "LOC_CUSTOM, TYPE_INT, NO_OFFSET, extra_NVX_gpu_memory_info" ],
+ [ "GPU_MEMORY_INFO_EVICTED_MEMORY_NVX", "LOC_CUSTOM, TYPE_INT, NO_OFFSET, extra_NVX_gpu_memory_info" ],
]},
# Enums restricted to OpenGL Core profile
@@ -847,7 +873,6 @@ descriptor=[
[ "MAX_VIEWPORTS", "CONTEXT_INT(Const.MaxViewports), extra_ARB_viewport_array" ],
[ "VIEWPORT_SUBPIXEL_BITS", "CONTEXT_INT(Const.ViewportSubpixelBits), extra_ARB_viewport_array" ],
[ "VIEWPORT_BOUNDS_RANGE", "CONTEXT_FLOAT2(Const.ViewportBounds), extra_ARB_viewport_array" ],
- [ "LAYER_PROVOKING_VERTEX", "CONTEXT_ENUM(Const.LayerAndVPIndexProvokingVertex), extra_ARB_viewport_array" ],
[ "VIEWPORT_INDEX_PROVOKING_VERTEX", "CONTEXT_ENUM(Const.LayerAndVPIndexProvokingVertex), extra_ARB_viewport_array" ],
# GL_ARB_gpu_shader5
diff --git a/src/mesa/main/hash.c b/src/mesa/main/hash.c
index 315b5d64004..ab1b9e907ae 100644
--- a/src/mesa/main/hash.c
+++ b/src/mesa/main/hash.c
@@ -496,14 +496,12 @@ _mesa_HashFindFreeKeyBlock(struct _mesa_HashTable *table, GLuint numKeys)
GLuint
_mesa_HashNumEntries(const struct _mesa_HashTable *table)
{
- struct hash_entry *entry;
GLuint count = 0;
if (table->deleted_key_data)
count++;
- hash_table_foreach(table->ht, entry)
- count++;
+ count += _mesa_hash_table_num_entries(table->ht);
return count;
}
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 56dce2d1b81..a66b56c62bf 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1253,6 +1253,9 @@ typedef enum {
USAGE_TEXTURE_BUFFER = 0x2,
USAGE_ATOMIC_COUNTER_BUFFER = 0x4,
USAGE_SHADER_STORAGE_BUFFER = 0x8,
+ USAGE_TRANSFORM_FEEDBACK_BUFFER = 0x10,
+ USAGE_PIXEL_PACK_BUFFER = 0x20,
+ USAGE_DISABLE_MINMAX_CACHE = 0x40,
} gl_buffer_usage;
@@ -1280,6 +1283,12 @@ struct gl_buffer_object
GLuint NumMapBufferWriteCalls;
struct gl_buffer_mapping Mappings[MAP_COUNT];
+
+ /** Memoization of min/max index computations for static index buffers */
+ struct hash_table *MinMaxCache;
+ unsigned MinMaxCacheHitIndices;
+ unsigned MinMaxCacheMissIndices;
+ bool MinMaxCacheDirty;
};
@@ -1861,6 +1870,8 @@ typedef enum
PROGRAM_SAMPLER, /**< for shader samplers, compile-time only */
PROGRAM_SYSTEM_VALUE,/**< InstanceId, PrimitiveID, etc. */
PROGRAM_UNDEFINED, /**< Invalid/TBD value */
+ PROGRAM_IMMEDIATE, /**< Immediate value, used by TGSI */
+ PROGRAM_BUFFER, /**< for shader buffers, compile-time only */
PROGRAM_FILE_MAX
} gl_register_file;
@@ -3217,6 +3228,10 @@ struct gl_framebuffer
struct {
GLuint Width, Height, Layers, NumSamples;
GLboolean FixedSampleLocations;
+ /* Derived from NumSamples by the driver so that it can choose a valid
+ * value for the hardware.
+ */
+ GLuint _NumSamples;
} DefaultGeometry;
/** \name Drawing bounds (Intersection of buffer size and scissor box)
@@ -3785,6 +3800,7 @@ struct gl_extensions
GLboolean ARB_occlusion_query2;
GLboolean ARB_pipeline_statistics_query;
GLboolean ARB_point_sprite;
+ GLboolean ARB_query_buffer_object;
GLboolean ARB_sample_shading;
GLboolean ARB_seamless_cube_map;
GLboolean ARB_shader_atomic_counters;
@@ -3880,6 +3896,7 @@ struct gl_extensions
GLboolean AMD_vertex_shader_layer;
GLboolean AMD_vertex_shader_viewport_index;
GLboolean APPLE_object_purgeable;
+ GLboolean ATI_meminfo;
GLboolean ATI_texture_compression_3dc;
GLboolean ATI_texture_mirror_once;
GLboolean ATI_texture_env_combine3;
@@ -3900,6 +3917,7 @@ struct gl_extensions
GLboolean NV_texture_env_combine4;
GLboolean NV_texture_rectangle;
GLboolean NV_vdpau_interop;
+ GLboolean NVX_gpu_memory_info;
GLboolean TDFX_texture_compression_FXT1;
GLboolean OES_EGL_image;
GLboolean OES_draw_texture;
@@ -4434,6 +4452,8 @@ struct gl_context
struct gl_buffer_object *CopyReadBuffer; /**< GL_ARB_copy_buffer */
struct gl_buffer_object *CopyWriteBuffer; /**< GL_ARB_copy_buffer */
+ struct gl_buffer_object *QueryBuffer; /**< GL_ARB_query_buffer_object */
+
/**
* Current GL_ARB_uniform_buffer_object binding referenced by
* GL_UNIFORM_BUFFER target for glBufferData, glMapBuffer, etc.
@@ -4576,6 +4596,18 @@ struct gl_context
GLboolean ShareGroupReset;
};
+/**
+ * Information about memory usage. All sizes are in kilobytes.
+ */
+struct gl_memory_info
+{
+ unsigned total_device_memory; /**< size of device memory, e.g. VRAM */
+ unsigned avail_device_memory; /**< free device memory at the moment */
+ unsigned total_staging_memory; /**< size of staging memory, e.g. GART */
+ unsigned avail_staging_memory; /**< free staging memory at the moment */
+ unsigned device_memory_evicted; /**< size of memory evicted (monotonic counter) */
+ unsigned nr_device_memory_evictions; /**< # of evictions (monotonic counter) */
+};
#ifdef DEBUG
extern int MESA_VERBOSE;
diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c
index 41f370ce485..b622d6a2979 100644
--- a/src/mesa/main/objectlabel.c
+++ b/src/mesa/main/objectlabel.c
@@ -288,16 +288,18 @@ void GLAPIENTRY
_mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label)
{
GET_CURRENT_CONTEXT(ctx);
- struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr;
+ struct gl_sync_object *syncObj;
const char *callerstr;
char **labelPtr;
+ syncObj = _mesa_get_and_ref_sync(ctx, (void*)ptr, true);
+
if (_mesa_is_desktop_gl(ctx))
callerstr = "glObjectPtrLabel";
else
callerstr = "glObjectPtrLabelKHR";
- if (!_mesa_validate_sync(ctx, syncObj)) {
+ if (!syncObj) {
_mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)",
callerstr);
return;
@@ -306,6 +308,7 @@ _mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label)
labelPtr = &syncObj->Label;
set_label(ctx, labelPtr, label, length, callerstr);
+ _mesa_unref_sync_object(ctx, syncObj, 1);
}
void GLAPIENTRY
@@ -313,7 +316,7 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length,
GLchar *label)
{
GET_CURRENT_CONTEXT(ctx);
- struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr;
+ struct gl_sync_object *syncObj;
const char *callerstr;
char **labelPtr;
@@ -328,7 +331,8 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length,
return;
}
- if (!_mesa_validate_sync(ctx, syncObj)) {
+ syncObj = _mesa_get_and_ref_sync(ctx, (void*)ptr, true);
+ if (!syncObj) {
_mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)",
callerstr);
return;
@@ -337,4 +341,5 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length,
labelPtr = &syncObj->Label;
copy_label(*labelPtr, label, length, bufSize);
+ _mesa_unref_sync_object(ctx, syncObj, 1);
}
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index 98366857f62..b86692a5f7e 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -23,6 +23,7 @@
*/
+#include "bufferobj.h"
#include "glheader.h"
#include "context.h"
#include "enums.h"
@@ -732,14 +733,16 @@ _mesa_GetQueryiv(GLenum target, GLenum pname, GLint *params)
_mesa_GetQueryIndexediv(target, 0, pname, params);
}
-void GLAPIENTRY
-_mesa_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params)
+static void
+get_query_object(struct gl_context *ctx, const char *func,
+ GLuint id, GLenum pname, GLenum ptype,
+ struct gl_buffer_object *buf, intptr_t offset)
{
struct gl_query_object *q = NULL;
- GET_CURRENT_CONTEXT(ctx);
+ uint64_t value;
if (MESA_VERBOSE & VERBOSE_API)
- _mesa_debug(ctx, "glGetQueryObjectiv(%u, %s)\n", id,
+ _mesa_debug(ctx, "%s(%u, %s)\n", func, id,
_mesa_enum_to_string(pname));
if (id)
@@ -747,96 +750,114 @@ _mesa_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params)
if (!q || q->Active || !q->EverBound) {
_mesa_error(ctx, GL_INVALID_OPERATION,
- "glGetQueryObjectivARB(id=%d is invalid or active)", id);
+ "%s(id=%d is invalid or active)", func, id);
return;
}
- switch (pname) {
- case GL_QUERY_RESULT_ARB:
- if (!q->Ready)
- ctx->Driver.WaitQuery(ctx, q);
- /* if result is too large for returned type, clamp to max value */
- if (q->Target == GL_ANY_SAMPLES_PASSED
- || q->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE) {
- if (q->Result)
- *params = GL_TRUE;
- else
- *params = GL_FALSE;
- } else {
- if (q->Result > 0x7fffffff) {
- *params = 0x7fffffff;
- }
- else {
- *params = (GLint)q->Result;
- }
- }
- break;
- case GL_QUERY_RESULT_AVAILABLE_ARB:
- if (!q->Ready)
- ctx->Driver.CheckQuery( ctx, q );
- *params = q->Ready;
- break;
- case GL_QUERY_TARGET:
- *params = q->Target;
- break;
- default:
- _mesa_error(ctx, GL_INVALID_ENUM, "glGetQueryObjectivARB(pname)");
+ if (buf && buf != ctx->Shared->NullBufferObj) {
+ bool is_64bit = ptype == GL_INT64_ARB ||
+ ptype == GL_UNSIGNED_INT64_ARB;
+ if (!ctx->Extensions.ARB_query_buffer_object) {
+ _mesa_error(ctx, GL_INVALID_OPERATION, "%s(not supported)", func);
return;
+ }
+ if (buf->Size < offset + 4 * (is_64bit ? 2 : 1)) {
+ _mesa_error(ctx, GL_INVALID_OPERATION, "%s(out of bounds)", func);
+ return;
+ }
+
+ switch (pname) {
+ case GL_QUERY_RESULT:
+ case GL_QUERY_RESULT_NO_WAIT:
+ case GL_QUERY_RESULT_AVAILABLE:
+ case GL_QUERY_TARGET:
+ ctx->Driver.StoreQueryResult(ctx, q, buf, offset, pname, ptype);
+ return;
+ }
+
+ /* fall through to get error below */
}
+
+ switch (pname) {
+ case GL_QUERY_RESULT:
+ if (!q->Ready)
+ ctx->Driver.WaitQuery(ctx, q);
+ value = q->Result;
+ break;
+ case GL_QUERY_RESULT_NO_WAIT:
+ if (!ctx->Extensions.ARB_query_buffer_object)
+ goto invalid_enum;
+ ctx->Driver.CheckQuery(ctx, q);
+ if (!q->Ready)
+ return;
+ value = q->Result;
+ break;
+ case GL_QUERY_RESULT_AVAILABLE:
+ if (!q->Ready)
+ ctx->Driver.CheckQuery(ctx, q);
+ value = q->Ready;
+ break;
+ case GL_QUERY_TARGET:
+ value = q->Target;
+ break;
+ default:
+invalid_enum:
+ _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)",
+ func, _mesa_enum_to_string(pname));
+ return;
+ }
+
+ /* TODO: Have the driver be required to handle this fixup. */
+ if (q->Target == GL_ANY_SAMPLES_PASSED ||
+ q->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE)
+ value = !!value;
+
+ switch (ptype) {
+ case GL_INT: {
+ GLint *param = (GLint *)offset;
+ if (value > 0x7fffffff)
+ *param = 0x7fffffff;
+ else
+ *param = value;
+ break;
+ }
+ case GL_UNSIGNED_INT: {
+ GLuint *param = (GLuint *)offset;
+ if (value > 0xffffffff)
+ *param = 0xffffffff;
+ else
+ *param = value;
+ break;
+ }
+ case GL_INT64_ARB:
+ case GL_UNSIGNED_INT64_ARB: {
+ GLuint64EXT *param = (GLuint64EXT *)offset;
+ *param = value;
+ break;
+ }
+ default:
+ unreachable("unexpected ptype");
+ }
+}
+
+void GLAPIENTRY
+_mesa_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params)
+{
+ GET_CURRENT_CONTEXT(ctx);
+
+ get_query_object(ctx, "glGetQueryObjectiv",
+ id, pname, GL_INT, ctx->QueryBuffer, (intptr_t)params);
}
void GLAPIENTRY
_mesa_GetQueryObjectuiv(GLuint id, GLenum pname, GLuint *params)
{
- struct gl_query_object *q = NULL;
GET_CURRENT_CONTEXT(ctx);
- if (MESA_VERBOSE & VERBOSE_API)
- _mesa_debug(ctx, "glGetQueryObjectuiv(%u, %s)\n", id,
- _mesa_enum_to_string(pname));
-
- if (id)
- q = _mesa_lookup_query_object(ctx, id);
-
- if (!q || q->Active || !q->EverBound) {
- _mesa_error(ctx, GL_INVALID_OPERATION,
- "glGetQueryObjectuivARB(id=%d is invalid or active)", id);
- return;
- }
-
- switch (pname) {
- case GL_QUERY_RESULT_ARB:
- if (!q->Ready)
- ctx->Driver.WaitQuery(ctx, q);
- /* if result is too large for returned type, clamp to max value */
- if (q->Target == GL_ANY_SAMPLES_PASSED
- || q->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE) {
- if (q->Result)
- *params = GL_TRUE;
- else
- *params = GL_FALSE;
- } else {
- if (q->Result > 0xffffffff) {
- *params = 0xffffffff;
- }
- else {
- *params = (GLuint)q->Result;
- }
- }
- break;
- case GL_QUERY_RESULT_AVAILABLE_ARB:
- if (!q->Ready)
- ctx->Driver.CheckQuery( ctx, q );
- *params = q->Ready;
- break;
- case GL_QUERY_TARGET:
- *params = q->Target;
- break;
- default:
- _mesa_error(ctx, GL_INVALID_ENUM, "glGetQueryObjectuivARB(pname)");
- return;
- }
+ get_query_object(ctx, "glGetQueryObjectuiv",
+ id, pname, GL_UNSIGNED_INT,
+ ctx->QueryBuffer, (intptr_t)params);
}
@@ -846,40 +867,11 @@ _mesa_GetQueryObjectuiv(GLuint id, GLenum pname, GLuint *params)
void GLAPIENTRY
_mesa_GetQueryObjecti64v(GLuint id, GLenum pname, GLint64EXT *params)
{
- struct gl_query_object *q = NULL;
GET_CURRENT_CONTEXT(ctx);
- if (MESA_VERBOSE & VERBOSE_API)
- _mesa_debug(ctx, "glGetQueryObjecti64v(%u, %s)\n", id,
- _mesa_enum_to_string(pname));
-
- if (id)
- q = _mesa_lookup_query_object(ctx, id);
-
- if (!q || q->Active || !q->EverBound) {
- _mesa_error(ctx, GL_INVALID_OPERATION,
- "glGetQueryObjectui64vARB(id=%d is invalid or active)", id);
- return;
- }
-
- switch (pname) {
- case GL_QUERY_RESULT_ARB:
- if (!q->Ready)
- ctx->Driver.WaitQuery(ctx, q);
- *params = q->Result;
- break;
- case GL_QUERY_RESULT_AVAILABLE_ARB:
- if (!q->Ready)
- ctx->Driver.CheckQuery( ctx, q );
- *params = q->Ready;
- break;
- case GL_QUERY_TARGET:
- *params = q->Target;
- break;
- default:
- _mesa_error(ctx, GL_INVALID_ENUM, "glGetQueryObjecti64vARB(pname)");
- return;
- }
+ get_query_object(ctx, "glGetQueryObjecti64v",
+ id, pname, GL_INT64_ARB,
+ ctx->QueryBuffer, (intptr_t)params);
}
@@ -889,40 +881,11 @@ _mesa_GetQueryObjecti64v(GLuint id, GLenum pname, GLint64EXT *params)
void GLAPIENTRY
_mesa_GetQueryObjectui64v(GLuint id, GLenum pname, GLuint64EXT *params)
{
- struct gl_query_object *q = NULL;
GET_CURRENT_CONTEXT(ctx);
- if (MESA_VERBOSE & VERBOSE_API)
- _mesa_debug(ctx, "glGetQueryObjectui64v(%u, %s)\n", id,
- _mesa_enum_to_string(pname));
-
- if (id)
- q = _mesa_lookup_query_object(ctx, id);
-
- if (!q || q->Active || !q->EverBound) {
- _mesa_error(ctx, GL_INVALID_OPERATION,
- "glGetQueryObjectuui64vARB(id=%d is invalid or active)", id);
- return;
- }
-
- switch (pname) {
- case GL_QUERY_RESULT_ARB:
- if (!q->Ready)
- ctx->Driver.WaitQuery(ctx, q);
- *params = q->Result;
- break;
- case GL_QUERY_RESULT_AVAILABLE_ARB:
- if (!q->Ready)
- ctx->Driver.CheckQuery( ctx, q );
- *params = q->Ready;
- break;
- case GL_QUERY_TARGET:
- *params = q->Target;
- break;
- default:
- _mesa_error(ctx, GL_INVALID_ENUM, "glGetQueryObjectui64vARB(pname)");
- return;
- }
+ get_query_object(ctx, "glGetQueryObjectui64v",
+ id, pname, GL_UNSIGNED_INT64_ARB,
+ ctx->QueryBuffer, (intptr_t)params);
}
/**
@@ -932,8 +895,15 @@ void GLAPIENTRY
_mesa_GetQueryBufferObjectiv(GLuint id, GLuint buffer, GLenum pname,
GLintptr offset)
{
+ struct gl_buffer_object *buf;
GET_CURRENT_CONTEXT(ctx);
- _mesa_error(ctx, GL_INVALID_OPERATION, "glGetQueryBufferObjectiv");
+
+ buf = _mesa_lookup_bufferobj_err(ctx, buffer, "glGetQueryBufferObjectiv");
+ if (!buf)
+ return;
+
+ get_query_object(ctx, "glGetQueryBufferObjectiv",
+ id, pname, GL_INT, buf, offset);
}
@@ -941,8 +911,15 @@ void GLAPIENTRY
_mesa_GetQueryBufferObjectuiv(GLuint id, GLuint buffer, GLenum pname,
GLintptr offset)
{
+ struct gl_buffer_object *buf;
GET_CURRENT_CONTEXT(ctx);
- _mesa_error(ctx, GL_INVALID_OPERATION, "glGetQueryBufferObjectuiv");
+
+ buf = _mesa_lookup_bufferobj_err(ctx, buffer, "glGetQueryBufferObjectuiv");
+ if (!buf)
+ return;
+
+ get_query_object(ctx, "glGetQueryBufferObjectuiv",
+ id, pname, GL_UNSIGNED_INT, buf, offset);
}
@@ -950,8 +927,15 @@ void GLAPIENTRY
_mesa_GetQueryBufferObjecti64v(GLuint id, GLuint buffer, GLenum pname,
GLintptr offset)
{
+ struct gl_buffer_object *buf;
GET_CURRENT_CONTEXT(ctx);
- _mesa_error(ctx, GL_INVALID_OPERATION, "glGetQueryBufferObjecti64v");
+
+ buf = _mesa_lookup_bufferobj_err(ctx, buffer, "glGetQueryBufferObjecti64v");
+ if (!buf)
+ return;
+
+ get_query_object(ctx, "glGetQueryBufferObjecti64v",
+ id, pname, GL_INT64_ARB, buf, offset);
}
@@ -959,8 +943,15 @@ void GLAPIENTRY
_mesa_GetQueryBufferObjectui64v(GLuint id, GLuint buffer, GLenum pname,
GLintptr offset)
{
+ struct gl_buffer_object *buf;
GET_CURRENT_CONTEXT(ctx);
- _mesa_error(ctx, GL_INVALID_OPERATION, "glGetQueryBufferObjectui64v");
+
+ buf = _mesa_lookup_bufferobj_err(ctx, buffer, "glGetQueryBufferObjectui64v");
+ if (!buf)
+ return;
+
+ get_query_object(ctx, "glGetQueryBufferObjectui64v",
+ id, pname, GL_UNSIGNED_INT64_ARB, buf, offset);
}
diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c
index c37b31d1753..b9f7bb65fb6 100644
--- a/src/mesa/main/shared.c
+++ b/src/mesa/main/shared.c
@@ -338,7 +338,7 @@ free_shared_state(struct gl_context *ctx, struct gl_shared_state *shared)
struct set_entry *entry;
set_foreach(shared->SyncObjects, entry) {
- _mesa_unref_sync_object(ctx, (struct gl_sync_object *) entry->key);
+ _mesa_unref_sync_object(ctx, (struct gl_sync_object *) entry->key, 1);
}
}
_mesa_set_destroy(shared->SyncObjects, NULL);
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 4043c4f2057..57f13411fdf 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -352,7 +352,7 @@ update_multisample(struct gl_context *ctx)
ctx->Multisample._Enabled = GL_FALSE;
if (ctx->Multisample.Enabled &&
ctx->DrawBuffer &&
- ctx->DrawBuffer->Visual.sampleBuffers)
+ _mesa_geometric_samples(ctx->DrawBuffer) > 0)
ctx->Multisample._Enabled = GL_TRUE;
}
diff --git a/src/mesa/main/syncobj.c b/src/mesa/main/syncobj.c
index c1b2d3bed54..be758dd1241 100644
--- a/src/mesa/main/syncobj.c
+++ b/src/mesa/main/syncobj.c
@@ -167,34 +167,42 @@ _mesa_free_sync_data(struct gl_context *ctx)
* - not in sync objects hash table
* - type is GL_SYNC_FENCE
* - not marked as deleted
+ *
+ * Returns the internal gl_sync_object pointer if the sync object is valid
+ * or NULL if it isn't.
+ *
+ * If "incRefCount" is true, the reference count is incremented, which is
+ * normally what you want; otherwise, a glDeleteSync from another thread
+ * could delete the sync object while you are still working on it.
*/
-bool
-_mesa_validate_sync(struct gl_context *ctx,
- const struct gl_sync_object *syncObj)
+struct gl_sync_object *
+_mesa_get_and_ref_sync(struct gl_context *ctx, GLsync sync, bool incRefCount)
{
- return (syncObj != NULL)
+ struct gl_sync_object *syncObj = (struct gl_sync_object *) sync;
+ mtx_lock(&ctx->Shared->Mutex);
+ if (syncObj != NULL
&& _mesa_set_search(ctx->Shared->SyncObjects, syncObj) != NULL
&& (syncObj->Type == GL_SYNC_FENCE)
- && !syncObj->DeletePending;
-}
-
-
-void
-_mesa_ref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj)
-{
- mtx_lock(&ctx->Shared->Mutex);
- syncObj->RefCount++;
+ && !syncObj->DeletePending) {
+ if (incRefCount) {
+ syncObj->RefCount++;
+ }
+ } else {
+ syncObj = NULL;
+ }
mtx_unlock(&ctx->Shared->Mutex);
+ return syncObj;
}
void
-_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj)
+_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj,
+ int amount)
{
struct set_entry *entry;
mtx_lock(&ctx->Shared->Mutex);
- syncObj->RefCount--;
+ syncObj->RefCount -= amount;
if (syncObj->RefCount == 0) {
entry = _mesa_set_search(ctx->Shared->SyncObjects, syncObj);
assert (entry != NULL);
@@ -212,10 +220,9 @@ GLboolean GLAPIENTRY
_mesa_IsSync(GLsync sync)
{
GET_CURRENT_CONTEXT(ctx);
- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_FALSE);
- return _mesa_validate_sync(ctx, syncObj) ? GL_TRUE : GL_FALSE;
+ return _mesa_get_and_ref_sync(ctx, sync, false) ? GL_TRUE : GL_FALSE;
}
@@ -223,7 +230,7 @@ void GLAPIENTRY
_mesa_DeleteSync(GLsync sync)
{
GET_CURRENT_CONTEXT(ctx);
- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
+ struct gl_sync_object *syncObj;
/* From the GL_ARB_sync spec:
*
@@ -235,16 +242,19 @@ _mesa_DeleteSync(GLsync sync)
return;
}
- if (!_mesa_validate_sync(ctx, syncObj)) {
+ syncObj = _mesa_get_and_ref_sync(ctx, sync, true);
+ if (!syncObj) {
_mesa_error(ctx, GL_INVALID_VALUE, "glDeleteSync (not a valid sync object)");
return;
}
/* If there are no client-waits or server-waits pending on this sync, delete
- * the underlying object.
+ * the underlying object. Note that we double-unref the object, as
+ * _mesa_get_and_ref_sync above took an extra refcount to make sure the pointer
+ * is valid for us to manipulate.
*/
syncObj->DeletePending = GL_TRUE;
- _mesa_unref_sync_object(ctx, syncObj);
+ _mesa_unref_sync_object(ctx, syncObj, 2);
}
@@ -299,21 +309,20 @@ GLenum GLAPIENTRY
_mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
{
GET_CURRENT_CONTEXT(ctx);
- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
+ struct gl_sync_object *syncObj;
GLenum ret;
ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_WAIT_FAILED);
- if (!_mesa_validate_sync(ctx, syncObj)) {
- _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)");
- return GL_WAIT_FAILED;
- }
-
if ((flags & ~GL_SYNC_FLUSH_COMMANDS_BIT) != 0) {
_mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync(flags=0x%x)", flags);
return GL_WAIT_FAILED;
}
- _mesa_ref_sync_object(ctx, syncObj);
+ syncObj = _mesa_get_and_ref_sync(ctx, sync, true);
+ if (!syncObj) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)");
+ return GL_WAIT_FAILED;
+ }
/* From the GL_ARB_sync spec:
*
@@ -335,7 +344,7 @@ _mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
}
}
- _mesa_unref_sync_object(ctx, syncObj);
+ _mesa_unref_sync_object(ctx, syncObj, 1);
return ret;
}
@@ -344,12 +353,7 @@ void GLAPIENTRY
_mesa_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
{
GET_CURRENT_CONTEXT(ctx);
- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
-
- if (!_mesa_validate_sync(ctx, syncObj)) {
- _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync (not a valid sync object)");
- return;
- }
+ struct gl_sync_object *syncObj;
if (flags != 0) {
_mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync(flags=0x%x)", flags);
@@ -362,7 +366,14 @@ _mesa_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
return;
}
+ syncObj = _mesa_get_and_ref_sync(ctx, sync, true);
+ if (!syncObj) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync (not a valid sync object)");
+ return;
+ }
+
ctx->Driver.ServerWaitSync(ctx, syncObj, flags, timeout);
+ _mesa_unref_sync_object(ctx, syncObj, 1);
}
@@ -371,11 +382,12 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length,
GLint *values)
{
GET_CURRENT_CONTEXT(ctx);
- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
+ struct gl_sync_object *syncObj;
GLsizei size = 0;
GLint v[1];
- if (!_mesa_validate_sync(ctx, syncObj)) {
+ syncObj = _mesa_get_and_ref_sync(ctx, sync, true);
+ if (!syncObj) {
_mesa_error(ctx, GL_INVALID_VALUE, "glGetSynciv (not a valid sync object)");
return;
}
@@ -409,6 +421,7 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length,
default:
_mesa_error(ctx, GL_INVALID_ENUM, "glGetSynciv(pname=0x%x)\n", pname);
+ _mesa_unref_sync_object(ctx, syncObj, 1);
return;
}
@@ -421,4 +434,6 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length,
if (length != NULL) {
*length = size;
}
+
+ _mesa_unref_sync_object(ctx, syncObj, 1);
}
diff --git a/src/mesa/main/syncobj.h b/src/mesa/main/syncobj.h
index 5d510e873a9..ea4a71222c0 100644
--- a/src/mesa/main/syncobj.h
+++ b/src/mesa/main/syncobj.h
@@ -47,15 +47,12 @@ _mesa_init_sync(struct gl_context *);
extern void
_mesa_free_sync_data(struct gl_context *);
-extern void
-_mesa_ref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj);
+struct gl_sync_object *
+_mesa_get_and_ref_sync(struct gl_context *ctx, GLsync sync, bool incRefCount);
extern void
-_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj);
-
-extern bool
-_mesa_validate_sync(struct gl_context *ctx,
- const struct gl_sync_object *syncObj);
+_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj,
+ int amount);
extern GLboolean GLAPIENTRY
_mesa_IsSync(GLsync sync);
diff --git a/src/mesa/main/transformfeedback.h b/src/mesa/main/transformfeedback.h
index bb9729cdbde..eb274ad6540 100644
--- a/src/mesa/main/transformfeedback.h
+++ b/src/mesa/main/transformfeedback.h
@@ -145,6 +145,9 @@ _mesa_set_transform_feedback_binding(struct gl_context *ctx,
tfObj->BufferNames[index] = bufObj->Name;
tfObj->Offset[index] = offset;
tfObj->RequestedSize[index] = size;
+
+ if (bufObj != ctx->Shared->NullBufferObj)
+ bufObj->UsageHistory |= USAGE_TRANSFORM_FEEDBACK_BUFFER;
}
/*** GL_ARB_direct_state_access ***/
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 3c51d18ed62..0f17ed136da 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2293,6 +2293,10 @@ add_uniform_to_shader::visit_field(const glsl_type *type, const char *name,
(void) row_major;
+ /* atomics don't get real storage */
+ if (type->contains_atomic())
+ return;
+
if (type->is_vector() || type->is_scalar()) {
size = type->vector_elements;
if (type->is_double())
diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index e98946b9387..34183d4d95f 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -454,73 +454,3 @@ _mesa_lookup_parameter_constant(const struct gl_program_parameter_list *list,
*posOut = -1;
return GL_FALSE;
}
-
-
-struct gl_program_parameter_list *
-_mesa_clone_parameter_list(const struct gl_program_parameter_list *list)
-{
- struct gl_program_parameter_list *clone;
- GLuint i;
-
- clone = _mesa_new_parameter_list();
- if (!clone)
- return NULL;
-
- /** Not too efficient, but correct */
- for (i = 0; i < list->NumParameters; i++) {
- struct gl_program_parameter *p = list->Parameters + i;
- struct gl_program_parameter *pCopy;
- GLuint size = MIN2(p->Size, 4);
- GLint j = _mesa_add_parameter(clone, p->Type, p->Name, size, p->DataType,
- list->ParameterValues[i], NULL);
- assert(j >= 0);
- pCopy = clone->Parameters + j;
- /* copy state indexes */
- if (p->Type == PROGRAM_STATE_VAR) {
- GLint k;
- for (k = 0; k < STATE_LENGTH; k++) {
- pCopy->StateIndexes[k] = p->StateIndexes[k];
- }
- }
- else {
- clone->Parameters[j].Size = p->Size;
- }
-
- }
-
- clone->StateFlags = list->StateFlags;
-
- return clone;
-}
-
-
-/**
- * Return a new parameter list which is listA + listB.
- */
-struct gl_program_parameter_list *
-_mesa_combine_parameter_lists(const struct gl_program_parameter_list *listA,
- const struct gl_program_parameter_list *listB)
-{
- struct gl_program_parameter_list *list;
-
- if (listA) {
- list = _mesa_clone_parameter_list(listA);
- if (list && listB) {
- GLuint i;
- for (i = 0; i < listB->NumParameters; i++) {
- struct gl_program_parameter *param = listB->Parameters + i;
- _mesa_add_parameter(list, param->Type, param->Name, param->Size,
- param->DataType,
- listB->ParameterValues[i],
- param->StateIndexes);
- }
- }
- }
- else if (listB) {
- list = _mesa_clone_parameter_list(listB);
- }
- else {
- list = NULL;
- }
- return list;
-}
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index 44700b710d7..c04d7a2e634 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -99,13 +99,6 @@ _mesa_new_parameter_list_sized(unsigned size);
extern void
_mesa_free_parameter_list(struct gl_program_parameter_list *paramList);
-extern struct gl_program_parameter_list *
-_mesa_clone_parameter_list(const struct gl_program_parameter_list *list);
-
-extern struct gl_program_parameter_list *
-_mesa_combine_parameter_lists(const struct gl_program_parameter_list *a,
- const struct gl_program_parameter_list *b);
-
static inline GLuint
_mesa_num_parameters(const struct gl_program_parameter_list *list)
{
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index 12490d0c380..eed241271df 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -40,6 +40,7 @@
#include "prog_statevars.h"
#include "prog_parameter.h"
#include "main/samplerobj.h"
+#include "framebuffer.h"
#define ONE_DIV_SQRT_LN2 (1.201122408786449815)
@@ -352,7 +353,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
}
return;
case STATE_NUM_SAMPLES:
- ((int *)value)[0] = ctx->DrawBuffer->Visual.samples;
+ ((int *)value)[0] = _mesa_geometric_samples(ctx->DrawBuffer);
return;
case STATE_DEPTH_RANGE:
value[0] = ctx->ViewportArray[0].Near; /* near */
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index 0e78e6ab25d..27867c48d52 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -31,6 +31,7 @@
#include "main/glheader.h"
#include "main/context.h"
+#include "main/framebuffer.h"
#include "main/hash.h"
#include "main/macros.h"
#include "program.h"
@@ -534,14 +535,14 @@ _mesa_get_min_invocations_per_fragment(struct gl_context *ctx,
* forces per-sample shading"
*/
if (prog->IsSample && !ignore_sample_qualifier)
- return MAX2(ctx->DrawBuffer->Visual.samples, 1);
+ return MAX2(_mesa_geometric_samples(ctx->DrawBuffer), 1);
if (prog->Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID |
SYSTEM_BIT_SAMPLE_POS))
- return MAX2(ctx->DrawBuffer->Visual.samples, 1);
+ return MAX2(_mesa_geometric_samples(ctx->DrawBuffer), 1);
else if (ctx->Multisample.SampleShading)
return MAX2(ceil(ctx->Multisample.MinSampleShadingValue *
- ctx->DrawBuffer->Visual.samples), 1);
+ _mesa_geometric_samples(ctx->DrawBuffer)), 1);
else
return 1;
}
diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 03097225bb2..4b89ade1b15 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -75,6 +75,16 @@ static const struct st_tracked_state *atoms[] =
&st_bind_tes_ubos,
&st_bind_fs_ubos,
&st_bind_gs_ubos,
+ &st_bind_vs_atomics,
+ &st_bind_tcs_atomics,
+ &st_bind_tes_atomics,
+ &st_bind_fs_atomics,
+ &st_bind_gs_atomics,
+ &st_bind_vs_ssbos,
+ &st_bind_tcs_ssbos,
+ &st_bind_tes_ssbos,
+ &st_bind_fs_ssbos,
+ &st_bind_gs_ssbos,
&st_update_pixel_transfer,
&st_update_tess,
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index a24842baa4f..3a9153c80cb 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -78,6 +78,16 @@ extern const struct st_tracked_state st_bind_vs_ubos;
extern const struct st_tracked_state st_bind_gs_ubos;
extern const struct st_tracked_state st_bind_tcs_ubos;
extern const struct st_tracked_state st_bind_tes_ubos;
+extern const struct st_tracked_state st_bind_fs_atomics;
+extern const struct st_tracked_state st_bind_vs_atomics;
+extern const struct st_tracked_state st_bind_gs_atomics;
+extern const struct st_tracked_state st_bind_tcs_atomics;
+extern const struct st_tracked_state st_bind_tes_atomics;
+extern const struct st_tracked_state st_bind_fs_ssbos;
+extern const struct st_tracked_state st_bind_vs_ssbos;
+extern const struct st_tracked_state st_bind_gs_ssbos;
+extern const struct st_tracked_state st_bind_tcs_ssbos;
+extern const struct st_tracked_state st_bind_tes_ssbos;
extern const struct st_tracked_state st_update_pixel_transfer;
extern const struct st_tracked_state st_update_tess;
diff --git a/src/mesa/state_tracker/st_atom_atomicbuf.c b/src/mesa/state_tracker/st_atom_atomicbuf.c
new file mode 100644
index 00000000000..1c30d1fb701
--- /dev/null
+++ b/src/mesa/state_tracker/st_atom_atomicbuf.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ *
+ * Copyright 2014 Ilia Mirkin. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "main/imports.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "compiler/glsl/ir_uniform.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+
+#include "st_debug.h"
+#include "st_cb_bufferobjects.h"
+#include "st_context.h"
+#include "st_atom.h"
+#include "st_program.h"
+
+static void
+st_bind_atomics(struct st_context *st,
+ struct gl_shader_program *prog,
+ unsigned shader_type)
+{
+ unsigned i;
+
+ if (!prog || !st->pipe->set_shader_buffers)
+ return;
+
+ for (i = 0; i < prog->NumAtomicBuffers; i++) {
+ struct gl_active_atomic_buffer *atomic = &prog->AtomicBuffers[i];
+ struct gl_atomic_buffer_binding *binding =
+ &st->ctx->AtomicBufferBindings[atomic->Binding];
+ struct st_buffer_object *st_obj =
+ st_buffer_object(binding->BufferObject);
+ struct pipe_shader_buffer sb = { 0 };
+
+ sb.buffer = st_obj->buffer;
+ sb.buffer_offset = binding->Offset;
+ sb.buffer_size = st_obj->buffer->width0 - binding->Offset;
+
+ st->pipe->set_shader_buffers(st->pipe, shader_type,
+ atomic->Binding, 1, &sb);
+ }
+}
+
+static void
+bind_vs_atomics(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
+
+ st_bind_atomics(st, prog, PIPE_SHADER_VERTEX);
+}
+
+const struct st_tracked_state st_bind_vs_atomics = {
+ "st_bind_vs_atomics",
+ {
+ 0,
+ ST_NEW_VERTEX_PROGRAM | ST_NEW_ATOMIC_BUFFER,
+ },
+ bind_vs_atomics
+};
+
+static void
+bind_fs_atomics(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT];
+
+ st_bind_atomics(st, prog, PIPE_SHADER_FRAGMENT);
+}
+
+const struct st_tracked_state st_bind_fs_atomics = {
+ "st_bind_fs_atomics",
+ {
+ 0,
+ ST_NEW_FRAGMENT_PROGRAM | ST_NEW_ATOMIC_BUFFER,
+ },
+ bind_fs_atomics
+};
+
+static void
+bind_gs_atomics(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
+
+ st_bind_atomics(st, prog, PIPE_SHADER_GEOMETRY);
+}
+
+const struct st_tracked_state st_bind_gs_atomics = {
+ "st_bind_gs_atomics",
+ {
+ 0,
+ ST_NEW_GEOMETRY_PROGRAM | ST_NEW_ATOMIC_BUFFER,
+ },
+ bind_gs_atomics
+};
+
+static void
+bind_tcs_atomics(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
+
+ st_bind_atomics(st, prog, PIPE_SHADER_TESS_CTRL);
+}
+
+const struct st_tracked_state st_bind_tcs_atomics = {
+ "st_bind_tcs_atomics",
+ {
+ 0,
+ ST_NEW_TESSCTRL_PROGRAM | ST_NEW_ATOMIC_BUFFER,
+ },
+ bind_tcs_atomics
+};
+
+static void
+bind_tes_atomics(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+
+ st_bind_atomics(st, prog, PIPE_SHADER_TESS_EVAL);
+}
+
+const struct st_tracked_state st_bind_tes_atomics = {
+ "st_bind_tes_atomics",
+ {
+ 0,
+ ST_NEW_TESSEVAL_PROGRAM | ST_NEW_ATOMIC_BUFFER,
+ },
+ bind_tes_atomics
+};
diff --git a/src/mesa/state_tracker/st_atom_storagebuf.c b/src/mesa/state_tracker/st_atom_storagebuf.c
new file mode 100644
index 00000000000..f165cc3e0a1
--- /dev/null
+++ b/src/mesa/state_tracker/st_atom_storagebuf.c
@@ -0,0 +1,196 @@
+/**************************************************************************
+ *
+ * Copyright 2014 Ilia Mirkin. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "main/imports.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "compiler/glsl/ir_uniform.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+
+#include "st_debug.h"
+#include "st_cb_bufferobjects.h"
+#include "st_context.h"
+#include "st_atom.h"
+#include "st_program.h"
+
+static void
+st_bind_ssbos(struct st_context *st, struct gl_shader *shader,
+ unsigned shader_type)
+{
+ unsigned i;
+ struct pipe_shader_buffer buffers[MAX_SHADER_STORAGE_BUFFERS];
+ struct gl_program_constants *c;
+
+ if (!shader || !st->pipe->set_shader_buffers)
+ return;
+
+ c = &st->ctx->Const.Program[shader->Stage];
+
+ for (i = 0; i < shader->NumShaderStorageBlocks; i++) {
+ struct gl_shader_storage_buffer_binding *binding;
+ struct st_buffer_object *st_obj;
+ struct pipe_shader_buffer *sb = &buffers[i];
+
+ binding = &st->ctx->ShaderStorageBufferBindings[
+ shader->ShaderStorageBlocks[i]->Binding];
+ st_obj = st_buffer_object(binding->BufferObject);
+
+ sb->buffer = st_obj->buffer;
+
+ if (sb->buffer) {
+ sb->buffer_offset = binding->Offset;
+ sb->buffer_size = sb->buffer->width0 - binding->Offset;
+
+ /* AutomaticSize is FALSE if the buffer was set with BindBufferRange.
+ * Take the minimum just to be sure.
+ */
+ if (!binding->AutomaticSize)
+ sb->buffer_size = MIN2(sb->buffer_size, (unsigned) binding->Size);
+ }
+ else {
+ sb->buffer_offset = 0;
+ sb->buffer_size = 0;
+ }
+ }
+ st->pipe->set_shader_buffers(st->pipe, shader_type, c->MaxAtomicBuffers,
+ shader->NumShaderStorageBlocks, buffers);
+ /* clear out any stale shader buffers */
+ if (shader->NumShaderStorageBlocks < c->MaxShaderStorageBlocks)
+ st->pipe->set_shader_buffers(
+ st->pipe, shader_type,
+ c->MaxAtomicBuffers + shader->NumShaderStorageBlocks,
+ c->MaxShaderStorageBlocks - shader->NumShaderStorageBlocks,
+ NULL);
+}
+
+static void bind_vs_ssbos(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
+
+ if (!prog)
+ return;
+
+ st_bind_ssbos(st, prog->_LinkedShaders[MESA_SHADER_VERTEX],
+ PIPE_SHADER_VERTEX);
+}
+
+const struct st_tracked_state st_bind_vs_ssbos = {
+ "st_bind_vs_ssbos",
+ {
+ 0,
+ ST_NEW_VERTEX_PROGRAM | ST_NEW_STORAGE_BUFFER,
+ },
+ bind_vs_ssbos
+};
+
+static void bind_fs_ssbos(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT];
+
+ if (!prog)
+ return;
+
+ st_bind_ssbos(st, prog->_LinkedShaders[MESA_SHADER_FRAGMENT],
+ PIPE_SHADER_FRAGMENT);
+}
+
+const struct st_tracked_state st_bind_fs_ssbos = {
+ "st_bind_fs_ssbos",
+ {
+ 0,
+ ST_NEW_FRAGMENT_PROGRAM | ST_NEW_STORAGE_BUFFER,
+ },
+ bind_fs_ssbos
+};
+
+static void bind_gs_ssbos(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
+
+ if (!prog)
+ return;
+
+ st_bind_ssbos(st, prog->_LinkedShaders[MESA_SHADER_GEOMETRY],
+ PIPE_SHADER_GEOMETRY);
+}
+
+const struct st_tracked_state st_bind_gs_ssbos = {
+ "st_bind_gs_ssbos",
+ {
+ 0,
+ ST_NEW_GEOMETRY_PROGRAM | ST_NEW_STORAGE_BUFFER,
+ },
+ bind_gs_ssbos
+};
+
+static void bind_tcs_ssbos(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
+
+ if (!prog)
+ return;
+
+ st_bind_ssbos(st, prog->_LinkedShaders[MESA_SHADER_TESS_CTRL],
+ PIPE_SHADER_TESS_CTRL);
+}
+
+const struct st_tracked_state st_bind_tcs_ssbos = {
+ "st_bind_tcs_ssbos",
+ {
+ 0,
+ ST_NEW_TESSCTRL_PROGRAM | ST_NEW_STORAGE_BUFFER,
+ },
+ bind_tcs_ssbos
+};
+
+static void bind_tes_ssbos(struct st_context *st)
+{
+ struct gl_shader_program *prog =
+ st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+
+ if (!prog)
+ return;
+
+ st_bind_ssbos(st, prog->_LinkedShaders[MESA_SHADER_TESS_EVAL],
+ PIPE_SHADER_TESS_EVAL);
+}
+
+const struct st_tracked_state st_bind_tes_ssbos = {
+ "st_bind_tes_ssbos",
+ {
+ 0,
+ ST_NEW_TESSEVAL_PROGRAM | ST_NEW_STORAGE_BUFFER,
+ },
+ bind_tes_ssbos
+};
diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index 68be8ba64ac..202b4eeeefa 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -237,6 +237,13 @@ st_bufferobj_data(struct gl_context *ctx,
case GL_PARAMETER_BUFFER_ARB:
bind = PIPE_BIND_COMMAND_ARGS_BUFFER;
break;
+ case GL_ATOMIC_COUNTER_BUFFER:
+ case GL_SHADER_STORAGE_BUFFER:
+ bind = PIPE_BIND_SHADER_BUFFER;
+ break;
+ case GL_QUERY_BUFFER:
+ bind = PIPE_BIND_QUERY_BUFFER;
+ break;
default:
bind = 0;
}
diff --git a/src/mesa/state_tracker/st_cb_queryobj.c b/src/mesa/state_tracker/st_cb_queryobj.c
index aafae16b2df..fc239bc778c 100644
--- a/src/mesa/state_tracker/st_cb_queryobj.c
+++ b/src/mesa/state_tracker/st_cb_queryobj.c
@@ -39,9 +39,11 @@
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
+#include "util/u_inlines.h"
#include "st_context.h"
#include "st_cb_queryobj.h"
#include "st_cb_bitmap.h"
+#include "st_cb_bufferobjects.h"
static struct gl_query_object *
@@ -271,7 +273,7 @@ st_WaitQuery(struct gl_context *ctx, struct gl_query_object *q)
{
/* nothing */
}
-
+
q->Ready = GL_TRUE;
}
@@ -303,6 +305,98 @@ st_GetTimestamp(struct gl_context *ctx)
}
}
+static void
+st_StoreQueryResult(struct gl_context *ctx, struct gl_query_object *q,
+ struct gl_buffer_object *buf, intptr_t offset,
+ GLenum pname, GLenum ptype)
+{
+ struct pipe_context *pipe = st_context(ctx)->pipe;
+ struct st_query_object *stq = st_query_object(q);
+ struct st_buffer_object *stObj = st_buffer_object(buf);
+ boolean wait = pname == GL_QUERY_RESULT;
+ enum pipe_query_value_type result_type;
+ int index;
+
+   /* GL_QUERY_TARGET is a bit of an extension since it has nothing to
+    * do with the GPU side of the query. Write the value out "by hand".
+    */
+ if (pname == GL_QUERY_TARGET) {
+      /* Assume that the data must be LE. The endianness situation with
+       * respect to CPU and GPU is incredibly confusing, but the vast
+       * majority of GPUs are LE. When a BE one comes along, this needs
+       * some form of resolution.
+       */
+ unsigned data[2] = { CPU_TO_LE32(q->Target), 0 };
+ pipe_buffer_write(pipe, stObj->buffer, offset,
+ (ptype == GL_INT64_ARB ||
+ ptype == GL_UNSIGNED_INT64_ARB) ? 8 : 4,
+ data);
+ return;
+ }
+
+ switch (ptype) {
+ case GL_INT:
+ result_type = PIPE_QUERY_TYPE_I32;
+ break;
+ case GL_UNSIGNED_INT:
+ result_type = PIPE_QUERY_TYPE_U32;
+ break;
+ case GL_INT64_ARB:
+ result_type = PIPE_QUERY_TYPE_I64;
+ break;
+ case GL_UNSIGNED_INT64_ARB:
+ result_type = PIPE_QUERY_TYPE_U64;
+ break;
+ default:
+ unreachable("Unexpected result type");
+ }
+
+ if (pname == GL_QUERY_RESULT_AVAILABLE) {
+ index = -1;
+ } else if (stq->type == PIPE_QUERY_PIPELINE_STATISTICS) {
+ switch (q->Target) {
+ case GL_VERTICES_SUBMITTED_ARB:
+ index = 0;
+ break;
+ case GL_PRIMITIVES_SUBMITTED_ARB:
+ index = 1;
+ break;
+ case GL_VERTEX_SHADER_INVOCATIONS_ARB:
+ index = 2;
+ break;
+ case GL_GEOMETRY_SHADER_INVOCATIONS:
+ index = 3;
+ break;
+ case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB:
+ index = 4;
+ break;
+ case GL_CLIPPING_INPUT_PRIMITIVES_ARB:
+ index = 5;
+ break;
+ case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB:
+ index = 6;
+ break;
+ case GL_FRAGMENT_SHADER_INVOCATIONS_ARB:
+ index = 7;
+ break;
+ case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
+ index = 8;
+ break;
+ case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
+ index = 9;
+ break;
+ case GL_COMPUTE_SHADER_INVOCATIONS_ARB:
+ index = 10;
+ break;
+ default:
+ unreachable("Unexpected target");
+ }
+ } else {
+ index = 0;
+ }
+
+ pipe->get_query_result_resource(pipe, stq->pq, wait, result_type, index,
+ stObj->buffer, offset);
+}
void st_init_query_functions(struct dd_function_table *functions)
{
@@ -313,4 +407,5 @@ void st_init_query_functions(struct dd_function_table *functions)
functions->WaitQuery = st_WaitQuery;
functions->CheckQuery = st_CheckQuery;
functions->GetTimestamp = st_GetTimestamp;
+ functions->StoreQueryResult = st_StoreQueryResult;
}
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 0ceb37027e1..f2b607c3a1d 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -60,6 +60,7 @@
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_tile.h"
#include "util/u_format.h"
@@ -67,6 +68,9 @@
#include "util/u_sampler.h"
#include "util/u_math.h"
#include "util/u_box.h"
+#include "util/u_simple_shaders.h"
+#include "cso_cache/cso_context.h"
+#include "tgsi/tgsi_ureg.h"
#define DBG if (0) printf
@@ -686,6 +690,999 @@ st_get_blit_mask(GLenum srcFormat, GLenum dstFormat)
}
}
+void
+st_init_pbo_upload(struct st_context *st)
+{
+ struct pipe_context *pipe = st->pipe;
+ struct pipe_screen *screen = pipe->screen;
+
+ st->pbo_upload.enabled =
+ screen->get_param(screen, PIPE_CAP_TEXTURE_BUFFER_OBJECTS) &&
+ screen->get_param(screen, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT) >= 1 &&
+ screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_INTEGERS);
+ if (!st->pbo_upload.enabled)
+ return;
+
+ st->pbo_upload.rgba_only =
+ screen->get_param(screen, PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY);
+
+ if (screen->get_param(screen, PIPE_CAP_TGSI_INSTANCEID)) {
+ if (screen->get_param(screen, PIPE_CAP_TGSI_VS_LAYER_VIEWPORT)) {
+ st->pbo_upload.upload_layers = true;
+ } else if (screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES) >= 3) {
+ st->pbo_upload.upload_layers = true;
+ st->pbo_upload.use_gs = true;
+ }
+ }
+
+ /* Blend state */
+ memset(&st->pbo_upload.blend, 0, sizeof(struct pipe_blend_state));
+ st->pbo_upload.blend.rt[0].colormask = PIPE_MASK_RGBA;
+
+ /* Rasterizer state */
+ memset(&st->pbo_upload.raster, 0, sizeof(struct pipe_rasterizer_state));
+ st->pbo_upload.raster.half_pixel_center = 1;
+}
+
+void
+st_destroy_pbo_upload(struct st_context *st)
+{
+ if (st->pbo_upload.fs) {
+ cso_delete_fragment_shader(st->cso_context, st->pbo_upload.fs);
+ st->pbo_upload.fs = NULL;
+ }
+
+ if (st->pbo_upload.gs) {
+ cso_delete_geometry_shader(st->cso_context, st->pbo_upload.gs);
+ st->pbo_upload.gs = NULL;
+ }
+
+ if (st->pbo_upload.vs) {
+ cso_delete_vertex_shader(st->cso_context, st->pbo_upload.vs);
+ st->pbo_upload.vs = NULL;
+ }
+}
+
+/**
+ * Converts format to a format with the same components, types
+ * and sizes, but with the components in RGBA order.
+ */
+static enum pipe_format
+unswizzle_format(enum pipe_format format)
+{
+ switch (format)
+ {
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_A8B8G8R8_UNORM:
+ return PIPE_FORMAT_R8G8B8A8_UNORM;
+
+ case PIPE_FORMAT_B10G10R10A2_UNORM:
+ return PIPE_FORMAT_R10G10B10A2_UNORM;
+
+ case PIPE_FORMAT_B10G10R10A2_SNORM:
+ return PIPE_FORMAT_R10G10B10A2_SNORM;
+
+ case PIPE_FORMAT_B10G10R10A2_UINT:
+ return PIPE_FORMAT_R10G10B10A2_UINT;
+
+ default:
+ return format;
+ }
+}
+
+/**
+ * Converts PIPE_FORMAT_A* to PIPE_FORMAT_R*.
+ */
+static enum pipe_format
+alpha_to_red(enum pipe_format format)
+{
+ switch (format)
+ {
+ case PIPE_FORMAT_A8_UNORM:
+ return PIPE_FORMAT_R8_UNORM;
+ case PIPE_FORMAT_A8_SNORM:
+ return PIPE_FORMAT_R8_SNORM;
+ case PIPE_FORMAT_A8_UINT:
+ return PIPE_FORMAT_R8_UINT;
+ case PIPE_FORMAT_A8_SINT:
+ return PIPE_FORMAT_R8_SINT;
+
+ case PIPE_FORMAT_A16_UNORM:
+ return PIPE_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_A16_SNORM:
+ return PIPE_FORMAT_R16_SNORM;
+ case PIPE_FORMAT_A16_UINT:
+ return PIPE_FORMAT_R16_UINT;
+ case PIPE_FORMAT_A16_SINT:
+ return PIPE_FORMAT_R16_SINT;
+ case PIPE_FORMAT_A16_FLOAT:
+ return PIPE_FORMAT_R16_FLOAT;
+
+ case PIPE_FORMAT_A32_UINT:
+ return PIPE_FORMAT_R32_UINT;
+ case PIPE_FORMAT_A32_SINT:
+ return PIPE_FORMAT_R32_SINT;
+ case PIPE_FORMAT_A32_FLOAT:
+ return PIPE_FORMAT_R32_FLOAT;
+
+ default:
+ return format;
+ }
+}
+
+/**
+ * Converts PIPE_FORMAT_R*A* to PIPE_FORMAT_R*G*.
+ */
+static enum pipe_format
+red_alpha_to_red_green(enum pipe_format format)
+{
+ switch (format)
+ {
+ case PIPE_FORMAT_R8A8_UNORM:
+ return PIPE_FORMAT_R8G8_UNORM;
+ case PIPE_FORMAT_R8A8_SNORM:
+ return PIPE_FORMAT_R8G8_SNORM;
+ case PIPE_FORMAT_R8A8_UINT:
+ return PIPE_FORMAT_R8G8_UINT;
+ case PIPE_FORMAT_R8A8_SINT:
+ return PIPE_FORMAT_R8G8_SINT;
+
+ case PIPE_FORMAT_R16A16_UNORM:
+ return PIPE_FORMAT_R16G16_UNORM;
+ case PIPE_FORMAT_R16A16_SNORM:
+ return PIPE_FORMAT_R16G16_SNORM;
+ case PIPE_FORMAT_R16A16_UINT:
+ return PIPE_FORMAT_R16G16_UINT;
+ case PIPE_FORMAT_R16A16_SINT:
+ return PIPE_FORMAT_R16G16_SINT;
+ case PIPE_FORMAT_R16A16_FLOAT:
+ return PIPE_FORMAT_R16G16_FLOAT;
+
+ case PIPE_FORMAT_R32A32_UINT:
+ return PIPE_FORMAT_R32G32_UINT;
+ case PIPE_FORMAT_R32A32_SINT:
+ return PIPE_FORMAT_R32G32_SINT;
+ case PIPE_FORMAT_R32A32_FLOAT:
+ return PIPE_FORMAT_R32G32_FLOAT;
+
+ default:
+ return format;
+ }
+}
+
+/**
+ * Converts PIPE_FORMAT_L*A* to PIPE_FORMAT_R*G*.
+ */
+static enum pipe_format
+luminance_alpha_to_red_green(enum pipe_format format)
+{
+ switch (format)
+ {
+ case PIPE_FORMAT_L8A8_UNORM:
+ return PIPE_FORMAT_R8G8_UNORM;
+ case PIPE_FORMAT_L8A8_SNORM:
+ return PIPE_FORMAT_R8G8_SNORM;
+ case PIPE_FORMAT_L8A8_UINT:
+ return PIPE_FORMAT_R8G8_UINT;
+ case PIPE_FORMAT_L8A8_SINT:
+ return PIPE_FORMAT_R8G8_SINT;
+
+ case PIPE_FORMAT_L16A16_UNORM:
+ return PIPE_FORMAT_R16G16_UNORM;
+ case PIPE_FORMAT_L16A16_SNORM:
+ return PIPE_FORMAT_R16G16_SNORM;
+ case PIPE_FORMAT_L16A16_UINT:
+ return PIPE_FORMAT_R16G16_UINT;
+ case PIPE_FORMAT_L16A16_SINT:
+ return PIPE_FORMAT_R16G16_SINT;
+ case PIPE_FORMAT_L16A16_FLOAT:
+ return PIPE_FORMAT_R16G16_FLOAT;
+
+ case PIPE_FORMAT_L32A32_UINT:
+ return PIPE_FORMAT_R32G32_UINT;
+ case PIPE_FORMAT_L32A32_SINT:
+ return PIPE_FORMAT_R32G32_SINT;
+ case PIPE_FORMAT_L32A32_FLOAT:
+ return PIPE_FORMAT_R32G32_FLOAT;
+
+ default:
+ return format;
+ }
+}
+
+/**
+ * Returns true if format is a PIPE_FORMAT_A* format, and false otherwise.
+ */
+static bool
+format_is_alpha(enum pipe_format format)
+{
+ const struct util_format_description *desc = util_format_description(format);
+
+ if (desc->nr_channels == 1 &&
+ desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_0 &&
+ desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_0 &&
+ desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_0 &&
+ desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
+ return true;
+
+ return false;
+}
+
+/**
+ * Returns true if format is a PIPE_FORMAT_R* format, and false otherwise.
+ */
+static bool
+format_is_red(enum pipe_format format)
+{
+ const struct util_format_description *desc = util_format_description(format);
+
+ if (desc->nr_channels == 1 &&
+ desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_X &&
+ desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_0 &&
+ desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_0 &&
+ desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1)
+ return true;
+
+ return false;
+}
+
+
+/**
+ * Returns true if format is a PIPE_FORMAT_L* format, and false otherwise.
+ */
+static bool
+format_is_luminance(enum pipe_format format)
+{
+ const struct util_format_description *desc = util_format_description(format);
+
+ if (desc->nr_channels == 1 &&
+ desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_X &&
+ desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_X &&
+ desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_X &&
+ desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1)
+ return true;
+
+ return false;
+}
+
+/**
+ * Returns true if format is a PIPE_FORMAT_R*A* format, and false otherwise.
+ */
+static bool
+format_is_red_alpha(enum pipe_format format)
+{
+ const struct util_format_description *desc = util_format_description(format);
+
+ if (desc->nr_channels == 2 &&
+ desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_X &&
+ desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_0 &&
+ desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_0 &&
+ desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_Y)
+ return true;
+
+ return false;
+}
+
+static bool
+format_is_swizzled_rgba(enum pipe_format format)
+{
+ const struct util_format_description *desc = util_format_description(format);
+
+ if ((desc->swizzle[0] == TGSI_SWIZZLE_X || desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_0) &&
+ (desc->swizzle[1] == TGSI_SWIZZLE_Y || desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_0) &&
+ (desc->swizzle[2] == TGSI_SWIZZLE_Z || desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_0) &&
+ (desc->swizzle[3] == TGSI_SWIZZLE_W || desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1))
+ return false;
+
+ return true;
+}
+
+struct format_table
+{
+ unsigned char swizzle[4];
+ enum pipe_format format;
+};
+
+static const struct format_table table_8888_unorm[] = {
+ { { 0, 1, 2, 3 }, PIPE_FORMAT_R8G8B8A8_UNORM },
+ { { 2, 1, 0, 3 }, PIPE_FORMAT_B8G8R8A8_UNORM },
+ { { 3, 0, 1, 2 }, PIPE_FORMAT_A8R8G8B8_UNORM },
+ { { 3, 2, 1, 0 }, PIPE_FORMAT_A8B8G8R8_UNORM }
+};
+
+static const struct format_table table_1010102_unorm[] = {
+ { { 0, 1, 2, 3 }, PIPE_FORMAT_R10G10B10A2_UNORM },
+ { { 2, 1, 0, 3 }, PIPE_FORMAT_B10G10R10A2_UNORM }
+};
+
+static const struct format_table table_1010102_snorm[] = {
+ { { 0, 1, 2, 3 }, PIPE_FORMAT_R10G10B10A2_SNORM },
+ { { 2, 1, 0, 3 }, PIPE_FORMAT_B10G10R10A2_SNORM }
+};
+
+static const struct format_table table_1010102_uint[] = {
+ { { 0, 1, 2, 3 }, PIPE_FORMAT_R10G10B10A2_UINT },
+ { { 2, 1, 0, 3 }, PIPE_FORMAT_B10G10R10A2_UINT }
+};
+
+static enum pipe_format
+swizzle_format(enum pipe_format format, const int * const swizzle)
+{
+ unsigned i;
+
+ switch (format) {
+ case PIPE_FORMAT_R8G8B8A8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_A8B8G8R8_UNORM:
+ for (i = 0; i < ARRAY_SIZE(table_8888_unorm); i++) {
+ if (swizzle[0] == table_8888_unorm[i].swizzle[0] &&
+ swizzle[1] == table_8888_unorm[i].swizzle[1] &&
+ swizzle[2] == table_8888_unorm[i].swizzle[2] &&
+ swizzle[3] == table_8888_unorm[i].swizzle[3])
+ return table_8888_unorm[i].format;
+ }
+ break;
+
+ case PIPE_FORMAT_R10G10B10A2_UNORM:
+ case PIPE_FORMAT_B10G10R10A2_UNORM:
+ for (i = 0; i < ARRAY_SIZE(table_1010102_unorm); i++) {
+ if (swizzle[0] == table_1010102_unorm[i].swizzle[0] &&
+ swizzle[1] == table_1010102_unorm[i].swizzle[1] &&
+ swizzle[2] == table_1010102_unorm[i].swizzle[2] &&
+ swizzle[3] == table_1010102_unorm[i].swizzle[3])
+ return table_1010102_unorm[i].format;
+ }
+ break;
+
+ case PIPE_FORMAT_R10G10B10A2_SNORM:
+ case PIPE_FORMAT_B10G10R10A2_SNORM:
+ for (i = 0; i < ARRAY_SIZE(table_1010102_snorm); i++) {
+ if (swizzle[0] == table_1010102_snorm[i].swizzle[0] &&
+ swizzle[1] == table_1010102_snorm[i].swizzle[1] &&
+ swizzle[2] == table_1010102_snorm[i].swizzle[2] &&
+ swizzle[3] == table_1010102_snorm[i].swizzle[3])
+ return table_1010102_snorm[i].format;
+ }
+ break;
+
+ case PIPE_FORMAT_R10G10B10A2_UINT:
+ case PIPE_FORMAT_B10G10R10A2_UINT:
+ for (i = 0; i < ARRAY_SIZE(table_1010102_uint); i++) {
+ if (swizzle[0] == table_1010102_uint[i].swizzle[0] &&
+ swizzle[1] == table_1010102_uint[i].swizzle[1] &&
+ swizzle[2] == table_1010102_uint[i].swizzle[2] &&
+ swizzle[3] == table_1010102_uint[i].swizzle[3])
+ return table_1010102_uint[i].format;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return PIPE_FORMAT_NONE;
+}
+
+static bool
+reinterpret_formats(enum pipe_format *src_format, enum pipe_format *dst_format)
+{
+ enum pipe_format src = *src_format;
+ enum pipe_format dst = *dst_format;
+
+ /* Note: dst_format has already been transformed from luminance/intensity
+ * to red when this function is called. The source format will never
+ * be an intensity format, because GL_INTENSITY is not a legal value
+ * for the format parameter in glTex(Sub)Image(). */
+
+ if (format_is_alpha(src)) {
+ if (!format_is_alpha(dst))
+ return false;
+
+ src = alpha_to_red(src);
+ dst = alpha_to_red(dst);
+ } else if (format_is_luminance(src)) {
+ if (!format_is_red(dst) && !format_is_red_alpha(dst))
+ return false;
+
+ src = util_format_luminance_to_red(src);
+ } else if (util_format_is_luminance_alpha(src)) {
+ src = luminance_alpha_to_red_green(src);
+
+ if (format_is_red_alpha(dst)) {
+ dst = red_alpha_to_red_green(dst);
+ } else if (!format_is_red(dst))
+ return false;
+ } else if (format_is_swizzled_rgba(src)) {
+ const struct util_format_description *src_desc = util_format_description(src);
+ const struct util_format_description *dst_desc = util_format_description(dst);
+ int swizzle[4];
+ unsigned i;
+
+ /* Make sure the format is an RGBA and not an RGBX format */
+ if (src_desc->nr_channels != 4 || src_desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1)
+ return false;
+
+ if (dst_desc->nr_channels != 4 || dst_desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1)
+ return false;
+
+ for (i = 0; i < 4; i++)
+ swizzle[i] = dst_desc->swizzle[src_desc->swizzle[i]];
+
+ dst = swizzle_format(dst, swizzle);
+ if (dst == PIPE_FORMAT_NONE)
+ return false;
+
+ src = unswizzle_format(src);
+ }
+
+ *src_format = src;
+ *dst_format = dst;
+ return true;
+}
+
+static void *
+create_pbo_upload_vs(struct st_context *st)
+{
+ struct ureg_program *ureg;
+ struct ureg_src in_pos;
+ struct ureg_src in_instanceid;
+ struct ureg_dst out_pos;
+ struct ureg_dst out_layer;
+
+ ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
+
+ in_pos = ureg_DECL_vs_input(ureg, TGSI_SEMANTIC_POSITION);
+
+ out_pos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
+
+ if (st->pbo_upload.upload_layers) {
+ in_instanceid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
+
+ if (!st->pbo_upload.use_gs)
+ out_layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
+ }
+
+ /* out_pos = in_pos */
+ ureg_MOV(ureg, out_pos, in_pos);
+
+ if (st->pbo_upload.upload_layers) {
+ if (st->pbo_upload.use_gs) {
+ /* out_pos.z = i2f(gl_InstanceID) */
+ ureg_I2F(ureg, ureg_writemask(out_pos, TGSI_WRITEMASK_Z),
+ ureg_scalar(in_instanceid, TGSI_SWIZZLE_X));
+ } else {
+ /* out_layer = gl_InstanceID */
+ ureg_MOV(ureg, out_layer, in_instanceid);
+ }
+ }
+
+ ureg_END(ureg);
+
+ return ureg_create_shader_and_destroy(ureg, st->pipe);
+}
+
+static void *
+create_pbo_upload_gs(struct st_context *st)
+{
+ static const int zero = 0;
+ struct ureg_program *ureg;
+ struct ureg_dst out_pos;
+ struct ureg_dst out_layer;
+ struct ureg_src in_pos;
+ struct ureg_src imm;
+ unsigned i;
+
+ ureg = ureg_create(TGSI_PROCESSOR_GEOMETRY);
+ if (!ureg)
+ return NULL;
+
+ ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, PIPE_PRIM_TRIANGLES);
+ ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, PIPE_PRIM_TRIANGLE_STRIP);
+ ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES, 3);
+
+ out_pos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
+ out_layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
+
+ in_pos = ureg_DECL_input(ureg, TGSI_SEMANTIC_POSITION, 0, 0, 1);
+
+ imm = ureg_DECL_immediate_int(ureg, &zero, 1);
+
+ for (i = 0; i < 3; ++i) {
+ struct ureg_src in_pos_vertex = ureg_src_dimension(in_pos, i);
+
+ /* out_pos = in_pos[i] */
+ ureg_MOV(ureg, out_pos, in_pos_vertex);
+
+ /* out_layer.x = f2i(in_pos[i].z) */
+ ureg_F2I(ureg, ureg_writemask(out_layer, TGSI_WRITEMASK_X),
+ ureg_scalar(in_pos_vertex, TGSI_SWIZZLE_Z));
+
+ ureg_EMIT(ureg, ureg_scalar(imm, TGSI_SWIZZLE_X));
+ }
+
+ ureg_END(ureg);
+
+ return ureg_create_shader_and_destroy(ureg, st->pipe);
+}
+
+static void *
+create_pbo_upload_fs(struct st_context *st)
+{
+ struct pipe_context *pipe = st->pipe;
+ struct pipe_screen *screen = pipe->screen;
+ struct ureg_program *ureg;
+ struct ureg_dst out;
+ struct ureg_src sampler;
+ struct ureg_src pos;
+ struct ureg_src layer;
+ struct ureg_src const0;
+ struct ureg_dst temp0;
+
+ ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
+ if (!ureg)
+ return NULL;
+
+ out = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
+ sampler = ureg_DECL_sampler(ureg, 0);
+ if (screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
+ pos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+ } else {
+ pos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+ TGSI_INTERPOLATE_LINEAR);
+ }
+ if (st->pbo_upload.upload_layers) {
+ layer = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_LAYER, 0,
+ TGSI_INTERPOLATE_CONSTANT);
+ }
+ const0 = ureg_DECL_constant(ureg, 0);
+ temp0 = ureg_DECL_temporary(ureg);
+
+ /* Note: const0 = [ -xoffset + skip_pixels, -yoffset, stride, image_height ] */
+
+ /* temp0.xy = f2i(temp0.xy) */
+ ureg_F2I(ureg, ureg_writemask(temp0, TGSI_WRITEMASK_XY),
+ ureg_swizzle(pos,
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y,
+ TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
+
+ /* temp0.xy = temp0.xy + const0.xy */
+ ureg_UADD(ureg, ureg_writemask(temp0, TGSI_WRITEMASK_XY),
+ ureg_swizzle(ureg_src(temp0),
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y,
+ TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
+ ureg_swizzle(const0,
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y,
+ TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
+
+ /* temp0.x = const0.z * temp0.y + temp0.x */
+ ureg_UMAD(ureg, ureg_writemask(temp0, TGSI_WRITEMASK_X),
+ ureg_scalar(const0, TGSI_SWIZZLE_Z),
+ ureg_scalar(ureg_src(temp0), TGSI_SWIZZLE_Y),
+ ureg_scalar(ureg_src(temp0), TGSI_SWIZZLE_X));
+
+ if (st->pbo_upload.upload_layers) {
+ /* temp0.x = const0.w * layer + temp0.x */
+ ureg_UMAD(ureg, ureg_writemask(temp0, TGSI_WRITEMASK_X),
+ ureg_scalar(const0, TGSI_SWIZZLE_W),
+ ureg_scalar(layer, TGSI_SWIZZLE_X),
+ ureg_scalar(ureg_src(temp0), TGSI_SWIZZLE_X));
+ }
+
+ /* out = txf(sampler, temp0.x) */
+ ureg_TXF(ureg, out, TGSI_TEXTURE_BUFFER,
+ ureg_scalar(ureg_src(temp0), TGSI_SWIZZLE_X),
+ sampler);
+
+ ureg_release_temporary(ureg, temp0);
+
+ ureg_END(ureg);
+
+ return ureg_create_shader_and_destroy(ureg, pipe);
+}
+
+static bool
+try_pbo_upload_common(struct gl_context *ctx,
+ struct pipe_surface *surface,
+ int xoffset, int yoffset,
+ unsigned upload_width, unsigned upload_height,
+ struct pipe_resource *buffer,
+ enum pipe_format src_format,
+ intptr_t buf_offset,
+ unsigned bytes_per_pixel,
+ unsigned stride,
+ unsigned image_height)
+{
+ struct st_context *st = st_context(ctx);
+ struct pipe_context *pipe = st->pipe;
+ unsigned depth = surface->u.tex.last_layer - surface->u.tex.first_layer + 1;
+ unsigned skip_pixels = 0;
+ bool success = false;
+
+ /* Check alignment. */
+ {
+ unsigned ofs = (buf_offset * bytes_per_pixel) % ctx->Const.TextureBufferOffsetAlignment;
+ if (ofs != 0) {
+ if (ofs % bytes_per_pixel != 0)
+ return false;
+
+ skip_pixels = ofs / bytes_per_pixel;
+ buf_offset -= skip_pixels;
+ }
+ }
+
+ /* Create the shaders */
+ if (!st->pbo_upload.vs) {
+ st->pbo_upload.vs = create_pbo_upload_vs(st);
+ if (!st->pbo_upload.vs)
+ return false;
+ }
+
+ if (depth != 1 && st->pbo_upload.use_gs && !st->pbo_upload.gs) {
+ st->pbo_upload.gs = create_pbo_upload_gs(st);
+ if (!st->pbo_upload.gs)
+ return false;
+ }
+
+ if (!st->pbo_upload.fs) {
+ st->pbo_upload.fs = create_pbo_upload_fs(st);
+ if (!st->pbo_upload.fs)
+ return false;
+ }
+
+ /* Set up the sampler_view */
+ {
+ unsigned first_element = buf_offset;
+ unsigned last_element = buf_offset + skip_pixels + upload_width - 1
+ + (upload_height - 1 + (depth - 1) * image_height) * stride;
+ struct pipe_sampler_view templ;
+ struct pipe_sampler_view *sampler_view;
+
+ /* This should be ensured by Mesa before calling our callbacks */
+ assert((last_element + 1) * bytes_per_pixel <= buffer->width0);
+
+ if (last_element - first_element > ctx->Const.MaxTextureBufferSize - 1)
+ return false;
+
+ memset(&templ, 0, sizeof(templ));
+ templ.format = src_format;
+ templ.u.buf.first_element = first_element;
+ templ.u.buf.last_element = last_element;
+ templ.swizzle_r = PIPE_SWIZZLE_RED;
+ templ.swizzle_g = PIPE_SWIZZLE_GREEN;
+ templ.swizzle_b = PIPE_SWIZZLE_BLUE;
+ templ.swizzle_a = PIPE_SWIZZLE_ALPHA;
+
+ sampler_view = pipe->create_sampler_view(pipe, buffer, &templ);
+ if (sampler_view == NULL)
+ return false;
+
+ cso_save_fragment_sampler_views(st->cso_context);
+ cso_set_sampler_views(st->cso_context, PIPE_SHADER_FRAGMENT, 1,
+ &sampler_view);
+
+ pipe_sampler_view_reference(&sampler_view, NULL);
+ }
+
+ /* Upload vertices */
+ {
+ struct pipe_vertex_buffer vbo;
+ struct pipe_vertex_element velem;
+
+ float x0 = (float) xoffset / surface->width * 2.0f - 1.0f;
+ float y0 = (float) yoffset / surface->height * 2.0f - 1.0f;
+ float x1 = (float) (xoffset + upload_width) / surface->width * 2.0f - 1.0f;
+ float y1 = (float) (yoffset + upload_height) / surface->height * 2.0f - 1.0f;
+
+ float *verts = NULL;
+
+ vbo.user_buffer = NULL;
+ vbo.buffer = NULL;
+ vbo.stride = 2 * sizeof(float);
+
+ u_upload_alloc(st->uploader, 0, 8 * sizeof(float), 4,
+ &vbo.buffer_offset, &vbo.buffer, (void **) &verts);
+ if (!verts)
+ goto fail_vertex_upload;
+
+ verts[0] = x0;
+ verts[1] = y0;
+ verts[2] = x0;
+ verts[3] = y1;
+ verts[4] = x1;
+ verts[5] = y0;
+ verts[6] = x1;
+ verts[7] = y1;
+
+ u_upload_unmap(st->uploader);
+
+ velem.src_offset = 0;
+ velem.instance_divisor = 0;
+ velem.vertex_buffer_index = cso_get_aux_vertex_buffer_slot(st->cso_context);
+ velem.src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+ cso_save_vertex_elements(st->cso_context);
+ cso_set_vertex_elements(st->cso_context, 1, &velem);
+
+ cso_save_aux_vertex_buffer_slot(st->cso_context);
+ cso_set_vertex_buffers(st->cso_context, velem.vertex_buffer_index,
+ 1, &vbo);
+
+ pipe_resource_reference(&vbo.buffer, NULL);
+ }
+
+ /* Upload constants */
+ {
+ struct pipe_constant_buffer cb;
+
+ struct {
+ int32_t xoffset;
+ int32_t yoffset;
+ int32_t stride;
+ int32_t image_size;
+ } constants;
+
+ constants.xoffset = -xoffset + skip_pixels;
+ constants.yoffset = -yoffset;
+ constants.stride = stride;
+ constants.image_size = stride * image_height;
+
+ if (st->constbuf_uploader) {
+ cb.buffer = NULL;
+ cb.user_buffer = NULL;
+ u_upload_data(st->constbuf_uploader, 0, sizeof(constants),
+ st->ctx->Const.UniformBufferOffsetAlignment,
+ &constants, &cb.buffer_offset, &cb.buffer);
+ if (!cb.buffer)
+ goto fail_constant_upload;
+
+ u_upload_unmap(st->constbuf_uploader);
+ } else {
+ cb.buffer = NULL;
+ cb.user_buffer = &constants;
+ cb.buffer_offset = 0;
+ }
+ cb.buffer_size = sizeof(constants);
+
+ cso_save_constant_buffer_slot0(st->cso_context, PIPE_SHADER_FRAGMENT);
+ cso_set_constant_buffer(st->cso_context, PIPE_SHADER_FRAGMENT, 0, &cb);
+
+ pipe_resource_reference(&cb.buffer, NULL);
+ }
+
+ /* Framebuffer_state */
+ {
+ struct pipe_framebuffer_state fb;
+ memset(&fb, 0, sizeof(fb));
+ fb.width = surface->width;
+ fb.height = surface->height;
+ fb.nr_cbufs = 1;
+ pipe_surface_reference(&fb.cbufs[0], surface);
+
+ cso_save_framebuffer(st->cso_context);
+ cso_set_framebuffer(st->cso_context, &fb);
+
+ pipe_surface_reference(&fb.cbufs[0], NULL);
+ }
+
+ /* Viewport state */
+ {
+ struct pipe_viewport_state vp;
+ vp.scale[0] = 0.5f * surface->width;
+ vp.scale[1] = 0.5f * surface->height;
+ vp.scale[2] = 1.0f;
+ vp.translate[0] = 0.5f * surface->width;
+ vp.translate[1] = 0.5f * surface->height;
+ vp.translate[2] = 0.0f;
+
+ cso_save_viewport(st->cso_context);
+ cso_set_viewport(st->cso_context, &vp);
+ }
+
+ /* Blend state */
+ cso_save_blend(st->cso_context);
+ cso_set_blend(st->cso_context, &st->pbo_upload.blend);
+
+ /* Rasterizer state */
+ cso_save_rasterizer(st->cso_context);
+ cso_set_rasterizer(st->cso_context, &st->pbo_upload.raster);
+
+ /* Set up the shaders */
+ cso_save_vertex_shader(st->cso_context);
+ cso_set_vertex_shader_handle(st->cso_context, st->pbo_upload.vs);
+
+ cso_save_geometry_shader(st->cso_context);
+ cso_set_geometry_shader_handle(st->cso_context,
+ depth != 1 ? st->pbo_upload.gs : NULL);
+
+ cso_save_tessctrl_shader(st->cso_context);
+ cso_set_tessctrl_shader_handle(st->cso_context, NULL);
+
+ cso_save_tesseval_shader(st->cso_context);
+ cso_set_tesseval_shader_handle(st->cso_context, NULL);
+
+ cso_save_fragment_shader(st->cso_context);
+ cso_set_fragment_shader_handle(st->cso_context, st->pbo_upload.fs);
+
+ /* Disable stream output */
+ cso_save_stream_outputs(st->cso_context);
+ cso_set_stream_outputs(st->cso_context, 0, NULL, 0);
+
+ if (depth == 1) {
+ cso_draw_arrays(st->cso_context, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
+ } else {
+ cso_draw_arrays_instanced(st->cso_context, PIPE_PRIM_TRIANGLE_STRIP,
+ 0, 4, 0, depth);
+ }
+
+ success = true;
+
+ cso_restore_framebuffer(st->cso_context);
+ cso_restore_viewport(st->cso_context);
+ cso_restore_blend(st->cso_context);
+ cso_restore_rasterizer(st->cso_context);
+ cso_restore_vertex_shader(st->cso_context);
+ cso_restore_geometry_shader(st->cso_context);
+ cso_restore_tessctrl_shader(st->cso_context);
+ cso_restore_tesseval_shader(st->cso_context);
+ cso_restore_fragment_shader(st->cso_context);
+ cso_restore_stream_outputs(st->cso_context);
+ cso_restore_constant_buffer_slot0(st->cso_context, PIPE_SHADER_FRAGMENT);
+fail_constant_upload:
+ cso_restore_vertex_elements(st->cso_context);
+ cso_restore_aux_vertex_buffer_slot(st->cso_context);
+fail_vertex_upload:
+ cso_restore_fragment_sampler_views(st->cso_context);
+
+ return success;
+}
+
+static bool
+try_pbo_upload(struct gl_context *ctx, GLuint dims,
+ struct gl_texture_image *texImage,
+ GLenum format, GLenum type,
+ enum pipe_format dst_format,
+ GLint xoffset, GLint yoffset, GLint zoffset,
+ GLint width, GLint height, GLint depth,
+ const void *pixels,
+ const struct gl_pixelstore_attrib *unpack)
+{
+ struct st_context *st = st_context(ctx);
+ struct st_texture_image *stImage = st_texture_image(texImage);
+ struct st_texture_object *stObj = st_texture_object(texImage->TexObject);
+ struct pipe_resource *texture = stImage->pt;
+ struct pipe_context *pipe = st->pipe;
+ struct pipe_screen *screen = pipe->screen;
+ struct pipe_surface *surface = NULL;
+ enum pipe_format src_format;
+ const struct util_format_description *desc;
+ GLenum gl_target = texImage->TexObject->Target;
+ intptr_t buf_offset;
+ unsigned bytes_per_pixel;
+ unsigned stride, image_height;
+ bool success;
+
+ if (!st->pbo_upload.enabled)
+ return false;
+
+ /* From now on, we need the gallium representation of dimensions. */
+ if (gl_target == GL_TEXTURE_1D_ARRAY) {
+ depth = height;
+ height = 1;
+ zoffset = yoffset;
+ yoffset = 0;
+ image_height = 1;
+ } else {
+ image_height = unpack->ImageHeight > 0 ? unpack->ImageHeight : height;
+ }
+
+ if (depth != 1 && !st->pbo_upload.upload_layers)
+ return false;
+
+ /* Choose the source format. Initially, we do so without checking driver
+ * support at all because of the remapping we later perform and because
+ * at least the Radeon driver actually supports some formats for texture
+ * buffers which it doesn't support for regular textures. */
+ src_format = st_choose_matching_format(st, 0, format, type, unpack->SwapBytes);
+ if (!src_format) {
+ return false;
+ }
+
+ src_format = util_format_linear(src_format);
+ desc = util_format_description(src_format);
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return false;
+
+ if (desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB)
+ return false;
+
+ if (st->pbo_upload.rgba_only) {
+ enum pipe_format orig_dst_format = dst_format;
+
+ if (!reinterpret_formats(&src_format, &dst_format)) {
+ return false;
+ }
+
+ if (dst_format != orig_dst_format &&
+ !screen->is_format_supported(screen, dst_format, PIPE_TEXTURE_2D, 0,
+ PIPE_BIND_RENDER_TARGET)) {
+ return false;
+ }
+ }
+
+ if (!src_format ||
+ !screen->is_format_supported(screen, src_format, PIPE_BUFFER, 0,
+ PIPE_BIND_SAMPLER_VIEW)) {
+ return false;
+ }
+
+ /* Check if the offset satisfies the alignment requirements */
+ buf_offset = (intptr_t) pixels;
+ bytes_per_pixel = desc->block.bits / 8;
+
+ if (buf_offset % bytes_per_pixel) {
+ return false;
+ }
+
+ /* Convert to texels */
+ buf_offset = buf_offset / bytes_per_pixel;
+
+ /* Compute the stride, taking unpack->Alignment into account */
+ {
+ unsigned pixels_per_row = unpack->RowLength > 0 ?
+ unpack->RowLength : width;
+ unsigned bytes_per_row = pixels_per_row * bytes_per_pixel;
+ unsigned remainder = bytes_per_row % unpack->Alignment;
+ unsigned offset_rows;
+
+ if (remainder > 0)
+ bytes_per_row += (unpack->Alignment - remainder);
+
+ if (bytes_per_row % bytes_per_pixel) {
+ return false;
+ }
+
+ stride = bytes_per_row / bytes_per_pixel;
+
+ offset_rows = unpack->SkipRows;
+ if (dims == 3)
+ offset_rows += image_height * unpack->SkipImages;
+
+ buf_offset += unpack->SkipPixels + stride * offset_rows;
+ }
+
+ /* Set up the surface */
+ {
+ unsigned level = stObj->pt != stImage->pt ? 0 : texImage->TexObject->MinLevel + texImage->Level;
+ unsigned max_layer = util_max_layer(texture, level);
+
+ zoffset += texImage->Face + texImage->TexObject->MinLayer;
+
+ struct pipe_surface templ;
+ memset(&templ, 0, sizeof(templ));
+ templ.format = dst_format;
+ templ.u.tex.level = level;
+ templ.u.tex.first_layer = MIN2(zoffset, max_layer);
+ templ.u.tex.last_layer = MIN2(zoffset + depth - 1, max_layer);
+
+ surface = pipe->create_surface(pipe, texture, &templ);
+ if (!surface)
+ return false;
+ }
+
+ success = try_pbo_upload_common(ctx, surface,
+ xoffset, yoffset, width, height,
+ st_buffer_object(unpack->BufferObj)->buffer,
+ src_format,
+ buf_offset,
+ bytes_per_pixel, stride, image_height);
+
+ pipe_surface_reference(&surface, NULL);
+
+ return success;
+}
static void
st_TexSubImage(struct gl_context *ctx, GLuint dims,
@@ -735,21 +1732,15 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
goto fallback;
}
- /* See if the texture format already matches the format and type,
- * in which case the memcpy-based fast path will likely be used and
- * we don't have to blit. */
- if (_mesa_format_matches_format_and_type(texImage->TexFormat, format,
- type, unpack->SwapBytes, NULL)) {
- goto fallback;
- }
+ /* See if the destination format is supported. */
if (format == GL_DEPTH_COMPONENT || format == GL_DEPTH_STENCIL)
bind = PIPE_BIND_DEPTH_STENCIL;
else
bind = PIPE_BIND_RENDER_TARGET;
- /* See if the destination format is supported.
- * For luminance and intensity, only the red channel is stored there. */
+ /* For luminance and intensity, only the red channel is stored
+ * in the destination. */
dst_format = util_format_linear(dst->format);
dst_format = util_format_luminance_to_red(dst_format);
dst_format = util_format_intensity_to_red(dst_format);
@@ -760,6 +1751,21 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
goto fallback;
}
+ if (_mesa_is_bufferobj(unpack->BufferObj)) {
+ if (try_pbo_upload(ctx, dims, texImage, format, type, dst_format,
+ xoffset, yoffset, zoffset,
+ width, height, depth, pixels, unpack))
+ return;
+ }
+
+ /* See if the texture format already matches the format and type,
+ * in which case the memcpy-based fast path will likely be used and
+ * we don't have to blit. */
+ if (_mesa_format_matches_format_and_type(texImage->TexFormat, format,
+ type, unpack->SwapBytes, NULL)) {
+ goto fallback;
+ }
+
/* Choose the source format. */
src_format = st_choose_matching_format(st, PIPE_BIND_SAMPLER_VIEW,
format, type, unpack->SwapBytes);
@@ -849,18 +1855,18 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
/* 1D array textures.
* We need to convert gallium coords to GL coords.
*/
- GLvoid *src = _mesa_image_address3d(unpack, pixels,
+ GLvoid *src = _mesa_image_address2d(unpack, pixels,
width, depth, format,
- type, 0, slice, 0);
+ type, slice, 0);
memcpy(map, src, bytesPerRow);
}
else {
ubyte *slice_map = map;
for (row = 0; row < (unsigned) height; row++) {
- GLvoid *src = _mesa_image_address3d(unpack, pixels,
- width, height, format,
- type, slice, row, 0);
+ GLvoid *src = _mesa_image_address(dims, unpack, pixels,
+ width, height, format,
+ type, slice, row, 0);
memcpy(slice_map, src, bytesPerRow);
slice_map += transfer->stride;
}
@@ -927,13 +1933,166 @@ st_TexImage(struct gl_context * ctx, GLuint dims,
}
+static void
+st_CompressedTexSubImage(struct gl_context *ctx, GLuint dims,
+ struct gl_texture_image *texImage,
+ GLint x, GLint y, GLint z,
+ GLsizei w, GLsizei h, GLsizei d,
+ GLenum format, GLsizei imageSize, const GLvoid *data)
+{
+ struct st_context *st = st_context(ctx);
+ struct st_texture_image *stImage = st_texture_image(texImage);
+ struct st_texture_object *stObj = st_texture_object(texImage->TexObject);
+ struct pipe_resource *texture = stImage->pt;
+ struct pipe_context *pipe = st->pipe;
+ struct pipe_screen *screen = pipe->screen;
+ struct pipe_resource *dst = stImage->pt;
+ struct pipe_surface *surface = NULL;
+ struct compressed_pixelstore store;
+ enum pipe_format copy_format;
+ unsigned bytes_per_block;
+ unsigned bw, bh;
+ intptr_t buf_offset;
+ bool success = false;
+
+ /* Check basic pre-conditions for PBO upload */
+ if (!st->prefer_blit_based_texture_transfer) {
+ goto fallback;
+ }
+
+ if (!_mesa_is_bufferobj(ctx->Unpack.BufferObj))
+ goto fallback;
+
+ if ((_mesa_is_format_etc2(texImage->TexFormat) && !st->has_etc2) ||
+ (texImage->TexFormat == MESA_FORMAT_ETC1_RGB8 && !st->has_etc1)) {
+ /* ETC isn't supported and is represented by uncompressed formats. */
+ goto fallback;
+ }
+
+ if (!dst) {
+ goto fallback;
+ }
+
+ if (!st->pbo_upload.enabled ||
+ !screen->get_param(screen, PIPE_CAP_SURFACE_REINTERPRET_BLOCKS)) {
+ goto fallback;
+ }
+
+ /* Choose the pipe format for the upload. */
+ bytes_per_block = util_format_get_blocksize(dst->format);
+ bw = util_format_get_blockwidth(dst->format);
+ bh = util_format_get_blockheight(dst->format);
+
+ switch (bytes_per_block) {
+ case 8:
+ copy_format = PIPE_FORMAT_R16G16B16A16_UINT;
+ break;
+ case 16:
+ copy_format = PIPE_FORMAT_R32G32B32A32_UINT;
+ break;
+ default:
+ goto fallback;
+ }
+
+ if (!screen->is_format_supported(screen, copy_format, PIPE_BUFFER, 0,
+ PIPE_BIND_SAMPLER_VIEW)) {
+ goto fallback;
+ }
+
+ if (!screen->is_format_supported(screen, copy_format, dst->target,
+ dst->nr_samples, PIPE_BIND_RENDER_TARGET)) {
+ goto fallback;
+ }
+
+ /* Interpret the pixelstore settings. */
+ _mesa_compute_compressed_pixelstore(dims, texImage->TexFormat, w, h, d,
+ &ctx->Unpack, &store);
+ assert(store.CopyBytesPerRow % bytes_per_block == 0);
+ assert(store.SkipBytes % bytes_per_block == 0);
+
+ /* Compute the offset into the buffer */
+ buf_offset = (intptr_t)data + store.SkipBytes;
+
+ if (buf_offset % bytes_per_block) {
+ goto fallback;
+ }
+
+ buf_offset = buf_offset / bytes_per_block;
+
+ /* Set up the surface. */
+ {
+ unsigned level = stObj->pt != stImage->pt ? 0 : texImage->TexObject->MinLevel + texImage->Level;
+ unsigned max_layer = util_max_layer(texture, level);
+
+ z += texImage->Face + texImage->TexObject->MinLayer;
+
+ struct pipe_surface templ;
+ memset(&templ, 0, sizeof(templ));
+ templ.format = copy_format;
+ templ.u.tex.level = level;
+ templ.u.tex.first_layer = MIN2(z, max_layer);
+ templ.u.tex.last_layer = MIN2(z + d - 1, max_layer);
+
+ surface = pipe->create_surface(pipe, texture, &templ);
+ if (!surface)
+ goto fallback;
+ }
+
+ success = try_pbo_upload_common(ctx, surface,
+ x / bw, y / bh,
+ store.CopyBytesPerRow / bytes_per_block,
+ store.CopyRowsPerSlice,
+ st_buffer_object(ctx->Unpack.BufferObj)->buffer,
+ copy_format,
+ buf_offset,
+ bytes_per_block,
+ store.TotalBytesPerRow / bytes_per_block,
+ store.TotalRowsPerSlice);
+
+ pipe_surface_reference(&surface, NULL);
+
+ if (success)
+ return;
+
+fallback:
+ _mesa_store_compressed_texsubimage(ctx, dims, texImage,
+ x, y, z, w, h, d,
+ format, imageSize, data);
+}
+
static void
st_CompressedTexImage(struct gl_context *ctx, GLuint dims,
struct gl_texture_image *texImage,
GLsizei imageSize, const GLvoid *data)
{
prep_teximage(ctx, texImage, GL_NONE, GL_NONE);
- _mesa_store_compressed_teximage(ctx, dims, texImage, imageSize, data);
+
+ /* only 2D and 3D compressed images are supported at this time */
+ if (dims == 1) {
+ _mesa_problem(ctx, "Unexpected glCompressedTexImage1D call");
+ return;
+ }
+
+ /* This is pretty simple, because unlike the general texstore path we don't
+ * have to worry about the usual image unpacking or image transfer
+ * operations.
+ */
+ assert(texImage);
+ assert(texImage->Width > 0);
+ assert(texImage->Height > 0);
+ assert(texImage->Depth > 0);
+
+ /* allocate storage for texture data */
+ if (!st_AllocTextureImageBuffer(ctx, texImage)) {
+ _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage%uD", dims);
+ return;
+ }
+
+ st_CompressedTexSubImage(ctx, dims, texImage,
+ 0, 0, 0,
+ texImage->Width, texImage->Height, texImage->Depth,
+ texImage->TexFormat,
+ imageSize, data);
}
@@ -1958,7 +3117,7 @@ st_init_texture_functions(struct dd_function_table *functions)
functions->QuerySamplesForFormat = st_QuerySamplesForFormat;
functions->TexImage = st_TexImage;
functions->TexSubImage = st_TexSubImage;
- functions->CompressedTexSubImage = _mesa_store_compressed_texsubimage;
+ functions->CompressedTexSubImage = st_CompressedTexSubImage;
functions->CopyTexSubImage = st_CopyTexSubImage;
functions->GenerateMipmap = st_generate_mipmap;
diff --git a/src/mesa/state_tracker/st_cb_texture.h b/src/mesa/state_tracker/st_cb_texture.h
index 1b685298c5f..55c86c401e2 100644
--- a/src/mesa/state_tracker/st_cb_texture.h
+++ b/src/mesa/state_tracker/st_cb_texture.h
@@ -53,5 +53,10 @@ st_finalize_texture(struct gl_context *ctx,
extern void
st_init_texture_functions(struct dd_function_table *functions);
+extern void
+st_init_pbo_upload(struct st_context *st);
+
+extern void
+st_destroy_pbo_upload(struct st_context *st);
#endif /* ST_CB_TEXTURE_H */
diff --git a/src/mesa/state_tracker/st_cb_texturebarrier.c b/src/mesa/state_tracker/st_cb_texturebarrier.c
index dd4dde74c86..2de150ba13a 100644
--- a/src/mesa/state_tracker/st_cb_texturebarrier.c
+++ b/src/mesa/state_tracker/st_cb_texturebarrier.c
@@ -65,6 +65,13 @@ st_MemoryBarrier(struct gl_context *ctx, GLbitfield barriers)
if (barriers & GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT)
flags |= PIPE_BARRIER_MAPPED_BUFFER;
+ if (barriers & GL_ATOMIC_COUNTER_BARRIER_BIT)
+ flags |= PIPE_BARRIER_SHADER_BUFFER;
+ if (barriers & GL_SHADER_STORAGE_BARRIER_BIT)
+ flags |= PIPE_BARRIER_SHADER_BUFFER;
+
+ if (barriers & GL_QUERY_BUFFER_BARRIER_BIT)
+ flags |= PIPE_BARRIER_QUERY_BUFFER;
if (flags && pipe->memory_barrier)
pipe->memory_barrier(pipe, flags);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index ce1e97aacb5..446ebfb563f 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -96,6 +96,30 @@ static void st_Enable(struct gl_context * ctx, GLenum cap, GLboolean state)
}
+/**
+ * Called via ctx->Driver.QueryMemoryInfo()
+ */
+static void
+st_query_memory_info(struct gl_context *ctx, struct gl_memory_info *out)
+{
+ struct pipe_screen *screen = st_context(ctx)->pipe->screen;
+ struct pipe_memory_info info;
+
+ assert(screen->query_memory_info);
+ if (!screen->query_memory_info)
+ return;
+
+ screen->query_memory_info(screen, &info);
+
+ out->total_device_memory = info.total_device_memory;
+ out->avail_device_memory = info.avail_device_memory;
+ out->total_staging_memory = info.total_staging_memory;
+ out->avail_staging_memory = info.avail_staging_memory;
+ out->device_memory_evicted = info.device_memory_evicted;
+ out->nr_device_memory_evictions = info.nr_device_memory_evictions;
+}
+
+
/**
* Called via ctx->Driver.UpdateState()
*/
@@ -136,6 +160,7 @@ st_destroy_context_priv(struct st_context *st)
st_destroy_drawpix(st);
st_destroy_drawtex(st);
st_destroy_perfmon(st);
+ st_destroy_pbo_upload(st);
for (shader = 0; shader < ARRAY_SIZE(st->state.sampler_views); shader++) {
for (i = 0; i < ARRAY_SIZE(st->state.sampler_views[0]); i++) {
@@ -209,6 +234,7 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
st_init_bitmap(st);
st_init_clear(st);
st_init_draw( st );
+ st_init_pbo_upload(st);
/* Choose texture target for glDrawPixels, glBitmap, renderbuffers */
if (pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES))
@@ -350,6 +376,8 @@ static void st_init_driver_flags(struct gl_driver_flags *f)
f->NewUniformBuffer = ST_NEW_UNIFORM_BUFFER;
f->NewDefaultTessLevels = ST_NEW_TESS_STATE;
f->NewTextureBuffer = ST_NEW_SAMPLER_VIEWS;
+ f->NewAtomicBuffer = ST_NEW_ATOMIC_BUFFER;
+ f->NewShaderStorageBuffer = ST_NEW_STORAGE_BUFFER;
}
struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
@@ -487,4 +515,5 @@ void st_init_driver_functions(struct pipe_screen *screen,
functions->Enable = st_Enable;
functions->UpdateState = st_invalidate_state;
+ functions->QueryMemoryInfo = st_query_memory_info;
}
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 9db5f11beb5..57076ad0d18 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -62,6 +62,8 @@ struct u_upload_mgr;
#define ST_NEW_TESSCTRL_PROGRAM (1 << 9)
#define ST_NEW_TESSEVAL_PROGRAM (1 << 10)
#define ST_NEW_SAMPLER_VIEWS (1 << 11)
+#define ST_NEW_ATOMIC_BUFFER (1 << 12)
+#define ST_NEW_STORAGE_BUFFER (1 << 13)
struct st_state_flags {
@@ -202,6 +204,19 @@ struct st_context
void *gs_layered;
} clear;
+ /* For gl(Compressed)Tex(Sub)Image */
+ struct {
+ struct pipe_rasterizer_state raster;
+ struct pipe_blend_state blend;
+ void *vs;
+ void *gs;
+ void *fs;
+ bool enabled;
+ bool rgba_only;
+ bool upload_layers;
+ bool use_gs;
+ } pbo_upload;
+
/** used for anything using util_draw_vertex_buffer */
struct pipe_vertex_element velems_util_draw[3];
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 53ea6767395..f25bd742f79 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -218,6 +218,11 @@ void st_init_limits(struct pipe_screen *screen,
c->MaxUniformBlockSize / 4 *
pc->MaxUniformBlocks);
+ pc->MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
+ pc->MaxAtomicBuffers = screen->get_shader_param(
+ screen, sh, PIPE_SHADER_CAP_MAX_SHADER_BUFFERS) / 2;
+ pc->MaxShaderStorageBlocks = pc->MaxAtomicBuffers;
+
/* Gallium doesn't really care about local vs. env parameters so use the
* same limits.
*/
@@ -333,6 +338,31 @@ void st_init_limits(struct pipe_screen *screen,
screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL);
c->GLSLFrontFacingIsSysVal =
screen->get_param(screen, PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL);
+
+ c->MaxAtomicBufferBindings =
+ c->Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers;
+ c->MaxCombinedAtomicBuffers =
+ c->Program[MESA_SHADER_VERTEX].MaxAtomicBuffers +
+ c->Program[MESA_SHADER_TESS_CTRL].MaxAtomicBuffers +
+ c->Program[MESA_SHADER_TESS_EVAL].MaxAtomicBuffers +
+ c->Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers +
+ c->Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers;
+ assert(c->MaxCombinedAtomicBuffers <= MAX_COMBINED_ATOMIC_BUFFERS);
+
+ if (c->MaxCombinedAtomicBuffers > 0)
+ extensions->ARB_shader_atomic_counters = GL_TRUE;
+
+ c->MaxCombinedShaderOutputResources = c->MaxDrawBuffers;
+ c->ShaderStorageBufferOffsetAlignment =
+ screen->get_param(screen, PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT);
+ if (c->ShaderStorageBufferOffsetAlignment) {
+ c->MaxCombinedShaderStorageBlocks = c->MaxShaderStorageBufferBindings =
+ c->MaxCombinedAtomicBuffers;
+ c->MaxCombinedShaderOutputResources +=
+ c->MaxCombinedShaderStorageBlocks;
+ c->MaxShaderStorageBlockSize = 1 << 27;
+ extensions->ARB_shader_storage_buffer_object = GL_TRUE;
+ }
}
@@ -465,6 +495,7 @@ void st_init_extensions(struct pipe_screen *screen,
{ o(ARB_occlusion_query2), PIPE_CAP_OCCLUSION_QUERY },
{ o(ARB_pipeline_statistics_query), PIPE_CAP_QUERY_PIPELINE_STATISTICS },
{ o(ARB_point_sprite), PIPE_CAP_POINT_SPRITE },
+ { o(ARB_query_buffer_object), PIPE_CAP_QUERY_BUFFER_OBJECT },
{ o(ARB_sample_shading), PIPE_CAP_SAMPLE_SHADING },
{ o(ARB_seamless_cube_map), PIPE_CAP_SEAMLESS_CUBE_MAP },
{ o(ARB_shader_draw_parameters), PIPE_CAP_DRAW_PARAMETERS },
@@ -496,12 +527,14 @@ void st_init_extensions(struct pipe_screen *screen,
{ o(EXT_transform_feedback), PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS },
{ o(AMD_pinned_memory), PIPE_CAP_RESOURCE_FROM_USER_MEMORY },
+ { o(ATI_meminfo), PIPE_CAP_QUERY_MEMORY_INFO },
{ o(AMD_seamless_cubemap_per_texture), PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE },
{ o(ATI_separate_stencil), PIPE_CAP_TWO_SIDED_STENCIL },
{ o(ATI_texture_mirror_once), PIPE_CAP_TEXTURE_MIRROR_CLAMP },
{ o(NV_conditional_render), PIPE_CAP_CONDITIONAL_RENDER },
{ o(NV_primitive_restart), PIPE_CAP_PRIMITIVE_RESTART },
{ o(NV_texture_barrier), PIPE_CAP_TEXTURE_BARRIER },
+ { o(NVX_gpu_memory_info), PIPE_CAP_QUERY_MEMORY_INFO },
/* GL_NV_point_sprite is not supported by gallium because we don't
* support the GL_POINT_SPRITE_R_MODE_NV option. */
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index cf91d39ff92..b8182de0be8 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -52,7 +52,6 @@
#include "st_mesa_to_tgsi.h"
-#define PROGRAM_IMMEDIATE PROGRAM_FILE_MAX
#define PROGRAM_ANY_CONST ((1 << PROGRAM_STATE_VAR) | \
(1 << PROGRAM_CONSTANT) | \
(1 << PROGRAM_UNIFORM))
@@ -267,6 +266,9 @@ public:
unsigned tex_offset_num_offset;
int dead_mask; /**< Used in dead code elimination */
+ st_src_reg buffer; /**< buffer register */
+ unsigned buffer_access; /**< buffer access type */
+
class function_entry *function; /* Set on TGSI_OPCODE_CAL or TGSI_OPCODE_BGNSUB */
const struct tgsi_opcode_info *info;
};
@@ -391,6 +393,7 @@ public:
int samplers_used;
glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
int sampler_targets[PIPE_MAX_SAMPLERS]; /**< One of TGSI_TEXTURE_* */
+ int buffers_used;
bool indirect_addr_consts;
int wpos_transform_const;
@@ -444,6 +447,10 @@ public:
virtual void visit(ir_barrier *);
/*@}*/
+ void visit_atomic_counter_intrinsic(ir_call *);
+ void visit_ssbo_intrinsic(ir_call *);
+ void visit_membar_intrinsic(ir_call *);
+
st_src_reg result;
/** List of variable_storage */
@@ -557,6 +564,28 @@ swizzle_for_size(int size)
return size_swizzles[size - 1];
}
+static bool
+is_resource_instruction(unsigned opcode)
+{
+ switch (opcode) {
+ case TGSI_OPCODE_RESQ:
+ case TGSI_OPCODE_LOAD:
+ case TGSI_OPCODE_ATOMUADD:
+ case TGSI_OPCODE_ATOMXCHG:
+ case TGSI_OPCODE_ATOMCAS:
+ case TGSI_OPCODE_ATOMAND:
+ case TGSI_OPCODE_ATOMOR:
+ case TGSI_OPCODE_ATOMXOR:
+ case TGSI_OPCODE_ATOMUMIN:
+ case TGSI_OPCODE_ATOMUMAX:
+ case TGSI_OPCODE_ATOMIMIN:
+ case TGSI_OPCODE_ATOMIMAX:
+ return true;
+ default:
+ return false;
+ }
+}
+
static unsigned
num_inst_dst_regs(const glsl_to_tgsi_instruction *op)
{
@@ -566,7 +595,8 @@ num_inst_dst_regs(const glsl_to_tgsi_instruction *op)
static unsigned
num_inst_src_regs(const glsl_to_tgsi_instruction *op)
{
- return op->info->is_tex ? op->info->num_src - 1 : op->info->num_src;
+ return op->info->is_tex || is_resource_instruction(op->op) ?
+ op->info->num_src - 1 : op->info->num_src;
}
glsl_to_tgsi_instruction *
@@ -661,8 +691,6 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
}
}
- this->instructions.push_tail(inst);
-
/*
* This section contains the double processing.
* GLSL just represents doubles as single channel values,
@@ -698,7 +726,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
int initial_src_swz[4], initial_src_idx[4];
int initial_dst_idx[2], initial_dst_writemask[2];
/* select the writemask for dst0 or dst1 */
- unsigned writemask = inst->dst[0].file == PROGRAM_UNDEFINED ? inst->dst[1].writemask : inst->dst[0].writemask;
+ unsigned writemask = inst->dst[1].file == PROGRAM_UNDEFINED ? inst->dst[0].writemask : inst->dst[1].writemask;
/* copy out the writemask, index and swizzles for all src/dsts. */
for (j = 0; j < 2; j++) {
@@ -715,10 +743,22 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
* scan all the components in the dst writemask
* generate an instruction for each of them if required.
*/
+ st_src_reg addr;
while (writemask) {
int i = u_bit_scan(&writemask);
+ /* before emitting the instruction, see if we have to adjust store
+ * address */
+ if (i > 1 && inst->op == TGSI_OPCODE_STORE &&
+ addr.file == PROGRAM_UNDEFINED) {
+ /* We have to advance the buffer address by 16 */
+ addr = get_temp(glsl_type::uint_type);
+ emit_asm(ir, TGSI_OPCODE_UADD, st_dst_reg(addr),
+ inst->src[0], st_src_reg_for_int(16));
+ }
+
+
/* first time use previous instruction */
if (dinst == NULL) {
dinst = inst;
@@ -728,16 +768,21 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
*dinst = *inst;
dinst->next = NULL;
dinst->prev = NULL;
- this->instructions.push_tail(dinst);
}
+ this->instructions.push_tail(dinst);
/* modify the destination if we are splitting */
for (j = 0; j < 2; j++) {
if (dst_is_double[j]) {
dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
dinst->dst[j].index = initial_dst_idx[j];
- if (i > 1)
+ if (i > 1) {
+ if (dinst->op == TGSI_OPCODE_STORE) {
+ dinst->src[0] = addr;
+ } else {
dinst->dst[j].index++;
+ }
+ }
} else {
/* if we aren't writing to a double, just get the bit of the initial writemask
for this channel */
@@ -773,6 +818,8 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
}
}
inst = dinst;
+ } else {
+ this->instructions.push_tail(inst);
}
@@ -807,7 +854,9 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
assert(src1.type != GLSL_TYPE_ARRAY);
assert(src1.type != GLSL_TYPE_STRUCT);
- if (src0.type == GLSL_TYPE_DOUBLE || src1.type == GLSL_TYPE_DOUBLE)
+ if (is_resource_instruction(op))
+ type = src1.type;
+ else if (src0.type == GLSL_TYPE_DOUBLE || src1.type == GLSL_TYPE_DOUBLE)
type = GLSL_TYPE_DOUBLE;
else if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
type = GLSL_TYPE_FLOAT;
@@ -891,6 +940,9 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
case3fid(FLR, FLR, DFLR);
case3fid(ROUND, ROUND, DROUND);
+ case2iu(ATOMIMAX, ATOMUMAX);
+ case2iu(ATOMIMIN, ATOMUMIN);
+
default: break;
}
@@ -2170,6 +2222,22 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
break;
+ case ir_unop_get_buffer_size: {
+ ir_constant *const_offset = ir->operands[0]->as_constant();
+ st_src_reg buffer(
+ PROGRAM_BUFFER,
+ ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
+ (const_offset ? const_offset->value.u[0] : 0),
+ GLSL_TYPE_UINT);
+ if (!const_offset) {
+ buffer.reladdr = ralloc(mem_ctx, st_src_reg);
+ memcpy(buffer.reladdr, &sampler_reladdr, sizeof(sampler_reladdr));
+ emit_arl(ir, sampler_reladdr, op[0]);
+ }
+ emit_asm(ir, TGSI_OPCODE_RESQ, result_dst)->buffer = buffer;
+ break;
+ }
+
case ir_unop_pack_snorm_2x16:
case ir_unop_pack_unorm_2x16:
case ir_unop_pack_snorm_4x8:
@@ -2190,10 +2258,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
*/
assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
break;
-
- case ir_unop_get_buffer_size:
- assert(!"Not implemented yet");
- break;
}
this->result = result_src;
@@ -3070,14 +3134,242 @@ glsl_to_tgsi_visitor::get_function_signature(ir_function_signature *sig)
return entry;
}
+void
+glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
+{
+ const char *callee = ir->callee->function_name();
+ ir_dereference *deref = static_cast<ir_dereference *>(
+ ir->actual_parameters.get_head());
+ ir_variable *location = deref->variable_referenced();
+
+ st_src_reg buffer(
+ PROGRAM_BUFFER, location->data.binding, GLSL_TYPE_ATOMIC_UINT);
+
+ /* Calculate the surface offset */
+ st_src_reg offset;
+ ir_dereference_array *deref_array = deref->as_dereference_array();
+
+ if (deref_array) {
+ offset = get_temp(glsl_type::uint_type);
+
+ deref_array->array_index->accept(this);
+
+ emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
+ this->result, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
+ emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(offset),
+ offset, st_src_reg_for_int(location->data.offset));
+ } else {
+ offset = st_src_reg_for_int(location->data.offset);
+ }
+
+ ir->return_deref->accept(this);
+ st_dst_reg dst(this->result);
+ dst.writemask = WRITEMASK_X;
+
+ glsl_to_tgsi_instruction *inst;
+
+ if (!strcmp("__intrinsic_atomic_read", callee)) {
+ inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, offset);
+ inst->buffer = buffer;
+ } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
+ inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
+ st_src_reg_for_int(1));
+ inst->buffer = buffer;
+ } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
+ inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
+ st_src_reg_for_int(-1));
+ inst->buffer = buffer;
+ emit_asm(ir, TGSI_OPCODE_ADD, dst, this->result, st_src_reg_for_int(-1));
+ }
+}
+
+void
+glsl_to_tgsi_visitor::visit_ssbo_intrinsic(ir_call *ir)
+{
+ const char *callee = ir->callee->function_name();
+ exec_node *param = ir->actual_parameters.get_head();
+
+ ir_rvalue *block = ((ir_instruction *)param)->as_rvalue();
+
+ param = param->get_next();
+ ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
+
+ ir_constant *const_block = block->as_constant();
+
+ st_src_reg buffer(
+ PROGRAM_BUFFER,
+ ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
+ (const_block ? const_block->value.u[0] : 0),
+ GLSL_TYPE_UINT);
+
+ if (!const_block) {
+ block->accept(this);
+ emit_arl(ir, sampler_reladdr, this->result);
+ buffer.reladdr = ralloc(mem_ctx, st_src_reg);
+ memcpy(buffer.reladdr, &sampler_reladdr, sizeof(sampler_reladdr));
+ }
+
+ /* Calculate the surface offset */
+ offset->accept(this);
+ st_src_reg off = this->result;
+
+ st_dst_reg dst = undef_dst;
+ if (ir->return_deref) {
+ ir->return_deref->accept(this);
+ dst = st_dst_reg(this->result);
+ dst.writemask = (1 << ir->return_deref->type->vector_elements) - 1;
+ }
+
+ glsl_to_tgsi_instruction *inst;
+
+ if (!strcmp("__intrinsic_load_ssbo", callee)) {
+ inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
+ if (dst.type == GLSL_TYPE_BOOL)
+ emit_asm(ir, TGSI_OPCODE_USNE, dst, st_src_reg(dst), st_src_reg_for_int(0));
+ } else if (!strcmp("__intrinsic_store_ssbo", callee)) {
+ param = param->get_next();
+ ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
+ val->accept(this);
+
+ param = param->get_next();
+ ir_constant *write_mask = ((ir_instruction *)param)->as_constant();
+ assert(write_mask);
+ dst.writemask = write_mask->value.u[0];
+
+ dst.type = this->result.type;
+ inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
+ } else {
+ param = param->get_next();
+ ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
+ val->accept(this);
+
+ st_src_reg data = this->result, data2 = undef_src;
+ unsigned opcode;
+ if (!strcmp("__intrinsic_atomic_add_ssbo", callee))
+ opcode = TGSI_OPCODE_ATOMUADD;
+ else if (!strcmp("__intrinsic_atomic_min_ssbo", callee))
+ opcode = TGSI_OPCODE_ATOMIMIN;
+ else if (!strcmp("__intrinsic_atomic_max_ssbo", callee))
+ opcode = TGSI_OPCODE_ATOMIMAX;
+ else if (!strcmp("__intrinsic_atomic_and_ssbo", callee))
+ opcode = TGSI_OPCODE_ATOMAND;
+ else if (!strcmp("__intrinsic_atomic_or_ssbo", callee))
+ opcode = TGSI_OPCODE_ATOMOR;
+ else if (!strcmp("__intrinsic_atomic_xor_ssbo", callee))
+ opcode = TGSI_OPCODE_ATOMXOR;
+ else if (!strcmp("__intrinsic_atomic_exchange_ssbo", callee))
+ opcode = TGSI_OPCODE_ATOMXCHG;
+ else if (!strcmp("__intrinsic_atomic_comp_swap_ssbo", callee)) {
+ opcode = TGSI_OPCODE_ATOMCAS;
+ param = param->get_next();
+ val = ((ir_instruction *)param)->as_rvalue();
+ val->accept(this);
+ data2 = this->result;
+ } else {
+ assert(!"Unexpected intrinsic");
+ return;
+ }
+
+ inst = emit_asm(ir, opcode, dst, off, data, data2);
+ }
+
+ param = param->get_next();
+ ir_constant *access = NULL;
+ if (!param->is_tail_sentinel()) {
+ access = ((ir_instruction *)param)->as_constant();
+ assert(access);
+ }
+
+ /* The emit_asm() might have actually split the op into pieces, e.g. for
+ * double stores. We have to go back and fix up all the generated ops.
+ */
+ unsigned op = inst->op;
+ do {
+ inst->buffer = buffer;
+ if (access)
+ inst->buffer_access = access->value.u[0];
+ inst = (glsl_to_tgsi_instruction *)inst->get_prev();
+ if (inst->op == TGSI_OPCODE_UADD)
+ inst = (glsl_to_tgsi_instruction *)inst->get_prev();
+ } while (inst && inst->buffer.file == PROGRAM_UNDEFINED && inst->op == op);
+}
+
+void
+glsl_to_tgsi_visitor::visit_membar_intrinsic(ir_call *ir)
+{
+ const char *callee = ir->callee->function_name();
+
+ if (!strcmp("__intrinsic_memory_barrier", callee))
+ emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
+ st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
+ TGSI_MEMBAR_ATOMIC_BUFFER |
+ TGSI_MEMBAR_SHADER_IMAGE |
+ TGSI_MEMBAR_SHARED));
+ else if (!strcmp("__intrinsic_memory_barrier_atomic_counter", callee))
+ emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
+ st_src_reg_for_int(TGSI_MEMBAR_ATOMIC_BUFFER));
+ else if (!strcmp("__intrinsic_memory_barrier_buffer", callee))
+ emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
+ st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER));
+ else if (!strcmp("__intrinsic_memory_barrier_image", callee))
+ emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
+ st_src_reg_for_int(TGSI_MEMBAR_SHADER_IMAGE));
+ else if (!strcmp("__intrinsic_memory_barrier_shared", callee))
+ emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
+ st_src_reg_for_int(TGSI_MEMBAR_SHARED));
+ else if (!strcmp("__intrinsic_group_memory_barrier", callee))
+ emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
+ st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
+ TGSI_MEMBAR_ATOMIC_BUFFER |
+ TGSI_MEMBAR_SHADER_IMAGE |
+ TGSI_MEMBAR_SHARED |
+ TGSI_MEMBAR_THREAD_GROUP));
+ else
+ assert(!"Unexpected memory barrier intrinsic");
+}
+
void
glsl_to_tgsi_visitor::visit(ir_call *ir)
{
glsl_to_tgsi_instruction *call_inst;
ir_function_signature *sig = ir->callee;
- function_entry *entry = get_function_signature(sig);
+ const char *callee = sig->function_name();
+ function_entry *entry;
int i;
+ /* Filter out intrinsics */
+ if (!strcmp("__intrinsic_atomic_read", callee) ||
+ !strcmp("__intrinsic_atomic_increment", callee) ||
+ !strcmp("__intrinsic_atomic_predecrement", callee)) {
+ visit_atomic_counter_intrinsic(ir);
+ return;
+ }
+
+ if (!strcmp("__intrinsic_load_ssbo", callee) ||
+ !strcmp("__intrinsic_store_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_add_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_min_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_max_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_and_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_or_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_xor_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_exchange_ssbo", callee) ||
+ !strcmp("__intrinsic_atomic_comp_swap_ssbo", callee)) {
+ visit_ssbo_intrinsic(ir);
+ return;
+ }
+
+ if (!strcmp("__intrinsic_memory_barrier", callee) ||
+ !strcmp("__intrinsic_memory_barrier_atomic_counter", callee) ||
+ !strcmp("__intrinsic_memory_barrier_buffer", callee) ||
+ !strcmp("__intrinsic_memory_barrier_image", callee) ||
+ !strcmp("__intrinsic_memory_barrier_shared", callee) ||
+ !strcmp("__intrinsic_group_memory_barrier", callee)) {
+ visit_membar_intrinsic(ir);
+ return;
+ }
+
+ entry = get_function_signature(sig);
/* Process in parameters. */
foreach_two_lists(formal_node, &sig->parameters,
actual_node, &ir->actual_parameters) {
@@ -3583,6 +3875,7 @@ glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
current_function = NULL;
num_address_regs = 0;
samplers_used = 0;
+ buffers_used = 0;
indirect_addr_consts = false;
wpos_transform_const = -1;
glsl_version = 0;
@@ -3617,6 +3910,7 @@ static void
count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
{
v->samplers_used = 0;
+ v->buffers_used = 0;
foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
if (inst->info->is_tex) {
@@ -3634,6 +3928,12 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
}
}
}
+ if (inst->buffer.file != PROGRAM_UNDEFINED && (
+ is_resource_instruction(inst->op) ||
+ inst->op == TGSI_OPCODE_STORE)) {
+ if (inst->buffer.file == PROGRAM_BUFFER)
+ v->buffers_used |= 1 << inst->buffer.index;
+ }
}
prog->SamplersUsed = v->samplers_used;
@@ -3822,9 +4122,11 @@ glsl_to_tgsi_visitor::get_last_temp_read_first_temp_write(int *last_reads, int *
last_reads[inst->src[j].index] = (depth == 0) ? i : -2;
}
for (j = 0; j < num_inst_dst_regs(inst); j++) {
- if (inst->dst[j].file == PROGRAM_TEMPORARY)
+ if (inst->dst[j].file == PROGRAM_TEMPORARY) {
if (first_writes[inst->dst[j].index] == -1)
first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
+ last_reads[inst->dst[j].index] = (depth == 0) ? i : -2;
+ }
}
for (j = 0; j < inst->tex_offset_num_offset; j++) {
if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
@@ -4229,7 +4531,11 @@ glsl_to_tgsi_visitor::eliminate_dead_code(void)
foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
if (!inst->dead_mask || !inst->dst[0].writemask)
continue;
- else if ((inst->dst[0].writemask & ~inst->dead_mask) == 0) {
+ /* No amount of dead masks should remove memory stores */
+ if (inst->info->is_store)
+ continue;
+
+ if ((inst->dst[0].writemask & ~inst->dead_mask) == 0) {
inst->remove();
delete inst;
removed++;
@@ -4338,6 +4644,7 @@ glsl_to_tgsi_visitor::merge_registers(void)
/* Update the first_writes and last_reads arrays with the new
* values for the merged register index, and mark the newly unused
* register index as such. */
+ assert(last_reads[j] >= last_reads[i]);
last_reads[i] = last_reads[j];
first_writes[j] = -1;
last_reads[j] = -1;
@@ -4407,6 +4714,7 @@ struct st_translate {
struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
struct ureg_dst address[3];
struct ureg_src samplers[PIPE_MAX_SAMPLERS];
+ struct ureg_src buffers[PIPE_MAX_SHADER_BUFFERS];
struct ureg_src systemValues[SYSTEM_VALUE_MAX];
struct tgsi_texture_offset tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
unsigned *array_sizes;
@@ -4814,13 +5122,13 @@ compile_tgsi_instruction(struct st_translate *t,
const glsl_to_tgsi_instruction *inst)
{
struct ureg_program *ureg = t->ureg;
- GLuint i;
+ int i;
struct ureg_dst dst[2];
struct ureg_src src[4];
struct tgsi_texture_offset texoffsets[MAX_GLSL_TEXTURE_OFFSET];
- unsigned num_dst;
- unsigned num_src;
+ int num_dst;
+ int num_src;
unsigned tex_target;
num_dst = num_inst_dst_regs(inst);
@@ -4868,7 +5176,7 @@ compile_tgsi_instruction(struct st_translate *t,
src[num_src] =
ureg_src_indirect(src[num_src], ureg_src(t->address[2]));
num_src++;
- for (i = 0; i < inst->tex_offset_num_offset; i++) {
+ for (i = 0; i < (int)inst->tex_offset_num_offset; i++) {
texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i], i);
}
tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
@@ -4881,6 +5189,38 @@ compile_tgsi_instruction(struct st_translate *t,
src, num_src);
return;
+ case TGSI_OPCODE_RESQ:
+ case TGSI_OPCODE_LOAD:
+ case TGSI_OPCODE_ATOMUADD:
+ case TGSI_OPCODE_ATOMXCHG:
+ case TGSI_OPCODE_ATOMCAS:
+ case TGSI_OPCODE_ATOMAND:
+ case TGSI_OPCODE_ATOMOR:
+ case TGSI_OPCODE_ATOMXOR:
+ case TGSI_OPCODE_ATOMUMIN:
+ case TGSI_OPCODE_ATOMUMAX:
+ case TGSI_OPCODE_ATOMIMIN:
+ case TGSI_OPCODE_ATOMIMAX:
+ for (i = num_src - 1; i >= 0; i--)
+ src[i + 1] = src[i];
+ num_src++;
+ src[0] = t->buffers[inst->buffer.index];
+ if (inst->buffer.reladdr)
+ src[0] = ureg_src_indirect(src[0], ureg_src(t->address[2]));
+ assert(src[0].File != TGSI_FILE_NULL);
+ ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
+ inst->buffer_access);
+ break;
+
+ case TGSI_OPCODE_STORE:
+ dst[0] = ureg_writemask(ureg_dst(t->buffers[inst->buffer.index]), inst->dst[0].writemask);
+ if (inst->buffer.reladdr)
+ dst[0] = ureg_dst_indirect(dst[0], ureg_src(t->address[2]));
+ assert(dst[0].File != TGSI_FILE_NULL);
+ ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
+ inst->buffer_access);
+ break;
+
case TGSI_OPCODE_SCS:
dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XY);
ureg_insn(ureg, inst->op, dst, num_dst, src, num_src);
@@ -5170,6 +5510,8 @@ st_translate_program(
{
struct st_translate *t;
unsigned i;
+ struct gl_program_constants *frag_const =
+ &ctx->Const.Program[MESA_SHADER_FRAGMENT];
enum pipe_error ret = PIPE_OK;
assert(numInputs <= ARRAY_SIZE(t->inputs));
@@ -5485,7 +5827,7 @@ st_translate_program(
assert(i == program->num_immediates);
/* texture samplers */
- for (i = 0; i < ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; i++) {
+ for (i = 0; i < frag_const->MaxTextureImageUnits; i++) {
if (program->samplers_used & (1 << i)) {
unsigned type;
@@ -5510,6 +5852,21 @@ st_translate_program(
}
}
+ for (i = 0; i < frag_const->MaxAtomicBuffers; i++) {
+ if (program->buffers_used & (1 << i)) {
+ t->buffers[i] = ureg_DECL_buffer(ureg, i, true);
+ }
+ }
+
+ for (; i < frag_const->MaxAtomicBuffers + frag_const->MaxShaderStorageBlocks;
+ i++) {
+ if (program->buffers_used & (1 << i)) {
+ t->buffers[i] = ureg_DECL_buffer(ureg, i, false);
+ }
+ }
+
+
+
/* Emit each instruction in turn:
*/
foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions) {
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index 0b8b6a9de56..6494aa518a2 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -180,6 +180,9 @@ vbo_sizeof_ib_type(GLenum type)
}
}
+void
+vbo_delete_minmax_cache(struct gl_buffer_object *bufferObj);
+
void
vbo_get_minmax_indices(struct gl_context *ctx, const struct _mesa_prim *prim,
const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 02139ef881f..f0245fd08cc 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -37,8 +37,6 @@
#include "main/enums.h"
#include "main/macros.h"
#include "main/transformfeedback.h"
-#include "main/sse_minmax.h"
-#include "x86/common_x86_asm.h"
#include "vbo_context.h"
@@ -80,152 +78,6 @@ vbo_check_buffers_are_unmapped(struct gl_context *ctx)
}
-
-/**
- * Compute min and max elements by scanning the index buffer for
- * glDraw[Range]Elements() calls.
- * If primitive restart is enabled, we need to ignore restart
- * indexes when computing min/max.
- */
-static void
-vbo_get_minmax_index(struct gl_context *ctx,
- const struct _mesa_prim *prim,
- const struct _mesa_index_buffer *ib,
- GLuint *min_index, GLuint *max_index,
- const GLuint count)
-{
- const GLboolean restart = ctx->Array._PrimitiveRestart;
- const GLuint restartIndex = _mesa_primitive_restart_index(ctx, ib->type);
- const int index_size = vbo_sizeof_ib_type(ib->type);
- const char *indices;
- GLuint i;
-
- indices = (char *) ib->ptr + prim->start * index_size;
- if (_mesa_is_bufferobj(ib->obj)) {
- GLsizeiptr size = MIN2(count * index_size, ib->obj->Size);
- indices = ctx->Driver.MapBufferRange(ctx, (GLintptr) indices, size,
- GL_MAP_READ_BIT, ib->obj,
- MAP_INTERNAL);
- }
-
- switch (ib->type) {
- case GL_UNSIGNED_INT: {
- const GLuint *ui_indices = (const GLuint *)indices;
- GLuint max_ui = 0;
- GLuint min_ui = ~0U;
- if (restart) {
- for (i = 0; i < count; i++) {
- if (ui_indices[i] != restartIndex) {
- if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
- if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
- }
- }
- }
- else {
-#if defined(USE_SSE41)
- if (cpu_has_sse4_1) {
- _mesa_uint_array_min_max(ui_indices, &min_ui, &max_ui, count);
- }
- else
-#endif
- for (i = 0; i < count; i++) {
- if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
- if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
- }
- }
- *min_index = min_ui;
- *max_index = max_ui;
- break;
- }
- case GL_UNSIGNED_SHORT: {
- const GLushort *us_indices = (const GLushort *)indices;
- GLuint max_us = 0;
- GLuint min_us = ~0U;
- if (restart) {
- for (i = 0; i < count; i++) {
- if (us_indices[i] != restartIndex) {
- if (us_indices[i] > max_us) max_us = us_indices[i];
- if (us_indices[i] < min_us) min_us = us_indices[i];
- }
- }
- }
- else {
- for (i = 0; i < count; i++) {
- if (us_indices[i] > max_us) max_us = us_indices[i];
- if (us_indices[i] < min_us) min_us = us_indices[i];
- }
- }
- *min_index = min_us;
- *max_index = max_us;
- break;
- }
- case GL_UNSIGNED_BYTE: {
- const GLubyte *ub_indices = (const GLubyte *)indices;
- GLuint max_ub = 0;
- GLuint min_ub = ~0U;
- if (restart) {
- for (i = 0; i < count; i++) {
- if (ub_indices[i] != restartIndex) {
- if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
- if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
- }
- }
- }
- else {
- for (i = 0; i < count; i++) {
- if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
- if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
- }
- }
- *min_index = min_ub;
- *max_index = max_ub;
- break;
- }
- default:
- unreachable("not reached");
- }
-
- if (_mesa_is_bufferobj(ib->obj)) {
- ctx->Driver.UnmapBuffer(ctx, ib->obj, MAP_INTERNAL);
- }
-}
-
-/**
- * Compute min and max elements for nr_prims
- */
-void
-vbo_get_minmax_indices(struct gl_context *ctx,
- const struct _mesa_prim *prims,
- const struct _mesa_index_buffer *ib,
- GLuint *min_index,
- GLuint *max_index,
- GLuint nr_prims)
-{
- GLuint tmp_min, tmp_max;
- GLuint i;
- GLuint count;
-
- *min_index = ~0;
- *max_index = 0;
-
- for (i = 0; i < nr_prims; i++) {
- const struct _mesa_prim *start_prim;
-
- start_prim = &prims[i];
- count = start_prim->count;
- /* Do combination if possible to reduce map/unmap count */
- while ((i + 1 < nr_prims) &&
- (prims[i].start + prims[i].count == prims[i+1].start)) {
- count += prims[i+1].count;
- i++;
- }
- vbo_get_minmax_index(ctx, start_prim, ib, &tmp_min, &tmp_max, count);
- *min_index = MIN2(*min_index, tmp_min);
- *max_index = MAX2(*max_index, tmp_max);
- }
-}
-
-
/**
* Check that element 'j' of the array has reasonable data.
* Map VBO if needed.
diff --git a/src/mesa/vbo/vbo_minmax_index.c b/src/mesa/vbo/vbo_minmax_index.c
new file mode 100644
index 00000000000..0f75a87f3f3
--- /dev/null
+++ b/src/mesa/vbo/vbo_minmax_index.c
@@ -0,0 +1,378 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright 2003 VMware, Inc.
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/varray.h"
+#include "main/macros.h"
+#include "main/sse_minmax.h"
+#include "x86/common_x86_asm.h"
+#include "util/hash_table.h"
+
+
+struct minmax_cache_key {
+ GLintptr offset;
+ GLuint count;
+ GLenum type;
+};
+
+
+struct minmax_cache_entry {
+ struct minmax_cache_key key;
+ GLuint min;
+ GLuint max;
+};
+
+
+static uint32_t
+vbo_minmax_cache_hash(const struct minmax_cache_key *key)
+{
+ return _mesa_hash_data(key, sizeof(*key));
+}
+
+
+static bool
+vbo_minmax_cache_key_equal(const struct minmax_cache_key *a,
+ const struct minmax_cache_key *b)
+{
+ return (a->offset == b->offset) && (a->count == b->count) && (a->type == b->type);
+}
+
+
+static void
+vbo_minmax_cache_delete_entry(struct hash_entry *entry)
+{
+ free(entry->data);
+}
+
+
+static GLboolean
+vbo_use_minmax_cache(struct gl_buffer_object *bufferObj)
+{
+ if (bufferObj->UsageHistory & (USAGE_TEXTURE_BUFFER |
+ USAGE_ATOMIC_COUNTER_BUFFER |
+ USAGE_SHADER_STORAGE_BUFFER |
+ USAGE_TRANSFORM_FEEDBACK_BUFFER |
+ USAGE_PIXEL_PACK_BUFFER |
+ USAGE_DISABLE_MINMAX_CACHE))
+ return GL_FALSE;
+
+ if ((bufferObj->Mappings[MAP_USER].AccessFlags &
+ (GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT)) ==
+ (GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT))
+ return GL_FALSE;
+
+ return GL_TRUE;
+}
+
+
+void
+vbo_delete_minmax_cache(struct gl_buffer_object *bufferObj)
+{
+ _mesa_hash_table_destroy(bufferObj->MinMaxCache, vbo_minmax_cache_delete_entry);
+ bufferObj->MinMaxCache = NULL;
+}
+
+
+static GLboolean
+vbo_get_minmax_cached(struct gl_buffer_object *bufferObj,
+ GLenum type, GLintptr offset, GLuint count,
+ GLuint *min_index, GLuint *max_index)
+{
+ GLboolean found = GL_FALSE;
+ struct minmax_cache_key key;
+ uint32_t hash;
+ struct hash_entry *result;
+
+ if (!bufferObj->MinMaxCache)
+ return GL_FALSE;
+ if (!vbo_use_minmax_cache(bufferObj))
+ return GL_FALSE;
+
+ mtx_lock(&bufferObj->Mutex);
+
+ if (bufferObj->MinMaxCacheDirty) {
+ /* Disable the cache permanently for this BO if the number of hits
+ * is asymptotically less than the number of misses. This happens when
+ * applications use the BO for streaming.
+ *
+ * However, some initial optimism allows applications that interleave
+ * draw calls with glBufferSubData during warmup.
+ */
+ unsigned optimism = bufferObj->Size;
+ if (bufferObj->MinMaxCacheMissIndices > optimism &&
+ bufferObj->MinMaxCacheHitIndices < bufferObj->MinMaxCacheMissIndices - optimism) {
+ bufferObj->UsageHistory |= USAGE_DISABLE_MINMAX_CACHE;
+ vbo_delete_minmax_cache(bufferObj);
+ goto out_disable;
+ }
+
+ _mesa_hash_table_clear(bufferObj->MinMaxCache, vbo_minmax_cache_delete_entry);
+ bufferObj->MinMaxCacheDirty = false;
+ goto out_invalidate;
+ }
+
+ key.type = type;
+ key.offset = offset;
+ key.count = count;
+ hash = vbo_minmax_cache_hash(&key);
+ result = _mesa_hash_table_search_pre_hashed(bufferObj->MinMaxCache, hash, &key);
+ if (result) {
+ struct minmax_cache_entry *entry = result->data;
+ *min_index = entry->min;
+ *max_index = entry->max;
+ found = GL_TRUE;
+ }
+
+out_invalidate:
+ if (found) {
+ /* The hit counter saturates so that we don't accidently disable the
+ * cache in a long-running program.
+ */
+ unsigned new_hit_count = bufferObj->MinMaxCacheHitIndices + count;
+
+ if (new_hit_count >= bufferObj->MinMaxCacheHitIndices)
+ bufferObj->MinMaxCacheHitIndices = new_hit_count;
+ else
+ bufferObj->MinMaxCacheHitIndices = ~(unsigned)0;
+ } else {
+ bufferObj->MinMaxCacheMissIndices += count;
+ }
+
+out_disable:
+ mtx_unlock(&bufferObj->Mutex);
+ return found;
+}
+
+
+static void
+vbo_minmax_cache_store(struct gl_context *ctx,
+ struct gl_buffer_object *bufferObj,
+ GLenum type, GLintptr offset, GLuint count,
+ GLuint min, GLuint max)
+{
+ struct minmax_cache_entry *entry;
+ struct hash_entry *table_entry;
+ uint32_t hash;
+
+ if (!vbo_use_minmax_cache(bufferObj))
+ return;
+
+ mtx_lock(&bufferObj->Mutex);
+
+ if (!bufferObj->MinMaxCache) {
+ bufferObj->MinMaxCache =
+ _mesa_hash_table_create(NULL,
+ (uint32_t (*)(const void *))vbo_minmax_cache_hash,
+ (bool (*)(const void *, const void *))vbo_minmax_cache_key_equal);
+ if (!bufferObj->MinMaxCache)
+ goto out;
+ }
+
+ entry = MALLOC_STRUCT(minmax_cache_entry);
+ if (!entry)
+ goto out;
+
+ entry->key.offset = offset;
+ entry->key.count = count;
+ entry->key.type = type;
+ entry->min = min;
+ entry->max = max;
+ hash = vbo_minmax_cache_hash(&entry->key);
+
+ table_entry = _mesa_hash_table_search_pre_hashed(bufferObj->MinMaxCache,
+ hash, &entry->key);
+ if (table_entry) {
+ /* It seems like this could happen when two contexts are rendering using
+ * the same buffer object from multiple threads.
+ */
+ _mesa_debug(ctx, "duplicate entry in minmax cache\n");
+ free(entry);
+ goto out;
+ }
+
+ table_entry = _mesa_hash_table_insert_pre_hashed(bufferObj->MinMaxCache,
+ hash, &entry->key, entry);
+ if (!table_entry)
+ free(entry);
+
+out:
+ mtx_unlock(&bufferObj->Mutex);
+}
+
+
+/**
+ * Compute min and max elements by scanning the index buffer for
+ * glDraw[Range]Elements() calls.
+ * If primitive restart is enabled, we need to ignore restart
+ * indexes when computing min/max.
+ */
+static void
+vbo_get_minmax_index(struct gl_context *ctx,
+ const struct _mesa_prim *prim,
+ const struct _mesa_index_buffer *ib,
+ GLuint *min_index, GLuint *max_index,
+ const GLuint count)
+{
+ const GLboolean restart = ctx->Array._PrimitiveRestart;
+ const GLuint restartIndex = _mesa_primitive_restart_index(ctx, ib->type);
+ const int index_size = vbo_sizeof_ib_type(ib->type);
+ const char *indices;
+ GLuint i;
+
+ indices = (char *) ib->ptr + prim->start * index_size;
+ if (_mesa_is_bufferobj(ib->obj)) {
+ GLsizeiptr size = MIN2(count * index_size, ib->obj->Size);
+
+ if (vbo_get_minmax_cached(ib->obj, ib->type, (GLintptr) indices, count,
+ min_index, max_index))
+ return;
+
+ indices = ctx->Driver.MapBufferRange(ctx, (GLintptr) indices, size,
+ GL_MAP_READ_BIT, ib->obj,
+ MAP_INTERNAL);
+ }
+
+ switch (ib->type) {
+ case GL_UNSIGNED_INT: {
+ const GLuint *ui_indices = (const GLuint *)indices;
+ GLuint max_ui = 0;
+ GLuint min_ui = ~0U;
+ if (restart) {
+ for (i = 0; i < count; i++) {
+ if (ui_indices[i] != restartIndex) {
+ if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
+ if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
+ }
+ }
+ }
+ else {
+#if defined(USE_SSE41)
+ if (cpu_has_sse4_1) {
+ _mesa_uint_array_min_max(ui_indices, &min_ui, &max_ui, count);
+ }
+ else
+#endif
+ for (i = 0; i < count; i++) {
+ if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
+ if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
+ }
+ }
+ *min_index = min_ui;
+ *max_index = max_ui;
+ break;
+ }
+ case GL_UNSIGNED_SHORT: {
+ const GLushort *us_indices = (const GLushort *)indices;
+ GLuint max_us = 0;
+ GLuint min_us = ~0U;
+ if (restart) {
+ for (i = 0; i < count; i++) {
+ if (us_indices[i] != restartIndex) {
+ if (us_indices[i] > max_us) max_us = us_indices[i];
+ if (us_indices[i] < min_us) min_us = us_indices[i];
+ }
+ }
+ }
+ else {
+ for (i = 0; i < count; i++) {
+ if (us_indices[i] > max_us) max_us = us_indices[i];
+ if (us_indices[i] < min_us) min_us = us_indices[i];
+ }
+ }
+ *min_index = min_us;
+ *max_index = max_us;
+ break;
+ }
+ case GL_UNSIGNED_BYTE: {
+ const GLubyte *ub_indices = (const GLubyte *)indices;
+ GLuint max_ub = 0;
+ GLuint min_ub = ~0U;
+ if (restart) {
+ for (i = 0; i < count; i++) {
+ if (ub_indices[i] != restartIndex) {
+ if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
+ if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
+ }
+ }
+ }
+ else {
+ for (i = 0; i < count; i++) {
+ if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
+ if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
+ }
+ }
+ *min_index = min_ub;
+ *max_index = max_ub;
+ break;
+ }
+ default:
+ unreachable("not reached");
+ }
+
+ if (_mesa_is_bufferobj(ib->obj)) {
+ vbo_minmax_cache_store(ctx, ib->obj, ib->type, prim->start, count,
+ *min_index, *max_index);
+ ctx->Driver.UnmapBuffer(ctx, ib->obj, MAP_INTERNAL);
+ }
+}
+
+/**
+ * Compute min and max elements for nr_prims
+ */
+void
+vbo_get_minmax_indices(struct gl_context *ctx,
+ const struct _mesa_prim *prims,
+ const struct _mesa_index_buffer *ib,
+ GLuint *min_index,
+ GLuint *max_index,
+ GLuint nr_prims)
+{
+ GLuint tmp_min, tmp_max;
+ GLuint i;
+ GLuint count;
+
+ *min_index = ~0;
+ *max_index = 0;
+
+ for (i = 0; i < nr_prims; i++) {
+ const struct _mesa_prim *start_prim;
+
+ start_prim = &prims[i];
+ count = start_prim->count;
+ /* Do combination if possible to reduce map/unmap count */
+ while ((i + 1 < nr_prims) &&
+ (prims[i].start + prims[i].count == prims[i+1].start)) {
+ count += prims[i+1].count;
+ i++;
+ }
+ vbo_get_minmax_index(ctx, start_prim, ib, &tmp_min, &tmp_max, count);
+ *min_index = MIN2(*min_index, tmp_min);
+ *max_index = MAX2(*max_index, tmp_max);
+ }
+}
diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S
index c185f62099e..b0aca19c8b0 100644
--- a/src/mesa/x86-64/xform4.S
+++ b/src/mesa/x86-64/xform4.S
@@ -69,7 +69,7 @@ _mesa_x86_64_transform_points4_general:
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
- prefetch 16(%rdx)
+ prefetcht1 16(%rdx)
movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
@@ -80,7 +80,7 @@ _mesa_x86_64_transform_points4_general:
p4_general_loop:
movups (%rdx), %xmm8 /* ox | oy | oz | ow */
- prefetchw 16(%rdi)
+ prefetcht1 16(%rdi)
pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
addq %rax, %rdx
@@ -93,7 +93,7 @@ p4_general_loop:
addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
- prefetch 16(%rdx)
+ prefetcht1 16(%rdx)
addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
@@ -150,7 +150,7 @@ _mesa_x86_64_transform_points4_3d:
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
- prefetch 16(%rdx)
+ prefetcht1 16(%rdx)
movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
@@ -166,7 +166,7 @@ _mesa_x86_64_transform_points4_3d:
p4_3d_loop:
movups (%rdx), %xmm8 /* ox | oy | oz | ow */
- prefetchw 16(%rdi)
+ prefetcht1 16(%rdi)
pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
addq %rax, %rdx
@@ -179,7 +179,7 @@ p4_3d_loop:
addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
- prefetch 16(%rdx)
+ prefetcht1 16(%rdx)
addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
@@ -210,8 +210,8 @@ _mesa_x86_64_transform_points4_identity:
movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
- prefetch 64(%rsi)
- prefetchw 64(%rdi)
+ prefetcht1 64(%rsi)
+ prefetcht1 64(%rdi)
add %ecx, %ecx
@@ -242,7 +242,7 @@ _mesa_3dnow_transform_points4_3d_no_rot:
movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
- prefetch (%rdx)
+ prefetcht1 (%rdx)
movd (%rsi), %mm0 /* | m00 */
.byte 0x66, 0x66, 0x90 /* manual align += 3 */
@@ -255,7 +255,7 @@ _mesa_3dnow_transform_points4_3d_no_rot:
p4_3d_no_rot_loop:
- prefetchw 32(%rdi)
+ prefetcht1 32(%rdi)
movq (%rdx), %mm4 /* x1 | x0 */
movq 8(%rdx), %mm5 /* x3 | x2 */
@@ -279,7 +279,7 @@ p4_3d_no_rot_loop:
addq $16, %rdi
decl %ecx
- prefetch 32(%rdx)
+ prefetcht1 32(%rdx)
jnz p4_3d_no_rot_loop
p4_3d_no_rot_done:
@@ -311,7 +311,7 @@ _mesa_3dnow_transform_points4_perspective:
punpckldq 20(%rsi), %mm0 /* m11 | m00 */
movq 32(%rsi), %mm2 /* m21 | m20 */
- prefetch (%rdx)
+ prefetcht1 (%rdx)
movd 40(%rsi), %mm1 /* | m22 */
@@ -321,7 +321,7 @@ _mesa_3dnow_transform_points4_perspective:
p4_perspective_loop:
- prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+ prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
movq (%rdx), %mm4 /* x1 | x0 */
movq 8(%rdx), %mm5 /* x3 | x2 */
@@ -347,7 +347,7 @@ p4_perspective_loop:
addq $16, %rdi
decl %ecx
- prefetch 32(%rdx) /* hopefully stride is zero */
+ prefetcht1 32(%rdx) /* hopefully stride is zero */
jnz p4_perspective_loop
p4_perspective_done:
@@ -374,14 +374,14 @@ _mesa_3dnow_transform_points4_2d_no_rot:
movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
movd (%rsi), %mm0 /* | m00 */
- prefetch (%rdx)
+ prefetcht1 (%rdx)
punpckldq 20(%rsi), %mm0 /* m11 | m00 */
movq 48(%rsi), %mm1 /* m31 | m30 */
p4_2d_no_rot_loop:
- prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+ prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
movq (%rdx), %mm4 /* x1 | x0 */
movq 8(%rdx), %mm5 /* x3 | x2 */
@@ -394,7 +394,7 @@ p4_2d_no_rot_loop:
addq %rax, %rdx
pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
- prefetch 32(%rdx) /* hopefully stride is zero */
+ prefetcht1 32(%rdx) /* hopefully stride is zero */
pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
movq %mm6, (%rdi) /* write r0, r1 */
@@ -433,7 +433,7 @@ _mesa_3dnow_transform_points4_2d:
movd (%rsi), %mm0 /* | m00 */
movd 4(%rsi), %mm1 /* | m01 */
- prefetch (%rdx)
+ prefetcht1 (%rdx)
punpckldq 16(%rsi), %mm0 /* m10 | m00 */
.byte 0x66, 0x66, 0x90 /* manual align += 4 */
@@ -443,7 +443,7 @@ _mesa_3dnow_transform_points4_2d:
p4_2d_loop:
- prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+ prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
movq (%rdx), %mm3 /* x1 | x0 */
movq 8(%rdx), %mm5 /* x3 | x2 */
@@ -460,7 +460,7 @@ p4_2d_loop:
pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
- prefetch 32(%rdx) /* hopefully stride is zero */
+ prefetcht1 32(%rdx) /* hopefully stride is zero */
pfadd %mm6, %mm3 /* r1 | r0 */
diff --git a/src/util/hash_table.c b/src/util/hash_table.c
index 3247593c1f6..4cfe3d93251 100644
--- a/src/util/hash_table.c
+++ b/src/util/hash_table.c
@@ -163,6 +163,32 @@ _mesa_hash_table_destroy(struct hash_table *ht,
ralloc_free(ht);
}
+/**
+ * Deletes all entries of the given hash table without deleting the table
+ * itself or changing its structure.
+ *
+ * If delete_function is passed, it gets called on each entry present.
+ */
+void
+_mesa_hash_table_clear(struct hash_table *ht,
+ void (*delete_function)(struct hash_entry *entry))
+{
+ struct hash_entry *entry;
+
+ for (entry = ht->table; entry != ht->table + ht->size; entry++) {
+ if (entry->key == NULL)
+ continue;
+
+ if (delete_function != NULL && entry->key != ht->deleted_key)
+ delete_function(entry);
+
+ entry->key = NULL;
+ }
+
+ ht->entries = 0;
+ ht->deleted_entries = 0;
+}
+
/** Sets the value of the key pointer used for deleted entries in the table.
*
* The assumption is that usually keys are actual pointers, so we use a
@@ -300,7 +326,8 @@ hash_table_insert(struct hash_table *ht, uint32_t hash,
* required to avoid memory leaks, perform a search
* before inserting.
*/
- if (entry->hash == hash &&
+ if (!entry_is_deleted(ht, entry) &&
+ entry->hash == hash &&
ht->key_equals_function(key, entry->key)) {
entry->key = key;
entry->data = data;
diff --git a/src/util/hash_table.h b/src/util/hash_table.h
index eb9dbc333ec..85b013cac24 100644
--- a/src/util/hash_table.h
+++ b/src/util/hash_table.h
@@ -64,9 +64,16 @@ _mesa_hash_table_create(void *mem_ctx,
const void *b));
void _mesa_hash_table_destroy(struct hash_table *ht,
void (*delete_function)(struct hash_entry *entry));
+void _mesa_hash_table_clear(struct hash_table *ht,
+ void (*delete_function)(struct hash_entry *entry));
void _mesa_hash_table_set_deleted_key(struct hash_table *ht,
const void *deleted_key);
+static inline uint32_t _mesa_hash_table_num_entries(struct hash_table *ht)
+{
+ return ht->entries;
+}
+
struct hash_entry *
_mesa_hash_table_insert(struct hash_table *ht, const void *key, void *data);
struct hash_entry *
diff --git a/src/util/set.c b/src/util/set.c
index f01f8699ac2..99abefd0632 100644
--- a/src/util/set.c
+++ b/src/util/set.c
@@ -282,7 +282,8 @@ set_add(struct set *ht, uint32_t hash, const void *key)
* If freeing of old keys is required to avoid memory leaks,
* perform a search before inserting.
*/
- if (entry->hash == hash &&
+ if (!entry_is_deleted(entry) &&
+ entry->hash == hash &&
ht->key_equals_function(key, entry->key)) {
entry->key = key;
return entry;
diff --git a/src/util/tests/hash_table/Makefile.am b/src/util/tests/hash_table/Makefile.am
index 04a77e30df1..8f12240cede 100644
--- a/src/util/tests/hash_table/Makefile.am
+++ b/src/util/tests/hash_table/Makefile.am
@@ -29,6 +29,7 @@ LDADD = \
$(DLOPEN_LIBS)
TESTS = \
+ clear \
collision \
delete_and_lookup \
delete_management \
diff --git a/src/util/tests/hash_table/clear.c b/src/util/tests/hash_table/clear.c
new file mode 100644
index 00000000000..526700bfb0f
--- /dev/null
+++ b/src/util/tests/hash_table/clear.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "hash_table.h"
+
+static void *make_key(uint32_t i)
+{
+ return (void *)(uintptr_t)(1 + i);
+}
+
+static uint32_t key_id(const void *key)
+{
+ return (uintptr_t)key - 1;
+}
+
+static uint32_t key_hash(const void *key)
+{
+ return (uintptr_t)key;
+}
+
+static bool key_equal(const void *a, const void *b)
+{
+ return a == b;
+}
+
+static void delete_function(struct hash_entry *entry)
+{
+ bool *deleted = (bool *)entry->data;
+ assert(!*deleted);
+ *deleted = true;
+}
+
+int main()
+{
+ struct hash_table *ht;
+ struct hash_entry *entry;
+ const uint32_t size = 1000;
+ bool flags[size];
+ uint32_t i;
+
+ ht = _mesa_hash_table_create(NULL, key_hash, key_equal);
+
+ for (i = 0; i < size; ++i) {
+ flags[i] = false;
+ _mesa_hash_table_insert(ht, make_key(i), &flags[i]);
+ }
+
+ _mesa_hash_table_clear(ht, delete_function);
+ assert(_mesa_hash_table_next_entry(ht, NULL) == NULL);
+
+ /* Check that delete_function was called and that repopulating the table
+ * works. */
+ for (i = 0; i < size; ++i) {
+ assert(flags[i]);
+ flags[i] = false;
+ _mesa_hash_table_insert(ht, make_key(i), &flags[i]);
+ }
+
+ /* Check that exactly the right set of entries is in the table. */
+ for (i = 0; i < size; ++i) {
+ assert(_mesa_hash_table_search(ht, make_key(i)));
+ }
+
+ hash_table_foreach(ht, entry) {
+ assert(key_id(entry->key) < size);
+ }
+
+ _mesa_hash_table_destroy(ht, NULL);
+
+ return 0;
+}