Merge remote-tracking branch 'mesa-public/master' into vulkan

Jason Ekstrand 2016-02-05 15:21:11 -08:00
commit 9401516113
248 changed files with 7167 additions and 2008 deletions


@ -5,6 +5,7 @@
(c-file-style . "stroustrup")
(fill-column . 78)
(eval . (progn
(c-set-offset 'case-label '0)
(c-set-offset 'innamespace '0)
(c-set-offset 'inline-open '0)))
)


@ -6,7 +6,7 @@
# - Select Git and fill in the Git clone URL
# - Setup a Git hook as explained in
# https://github.com/appveyor/webhooks#installing-git-hook
# - Check 'Settings > General > Skip branches without appveyor'
# - Check 'Settings > General > Skip branches without appveyor.yml'
# - Check 'Settings > General > Rolling builds'
# - Setup the global or project notifications to your liking
#
@ -24,7 +24,14 @@ branches:
except:
- /^travis.*$/
clone_depth: 5
# Don't download the full Mesa history to speed up cloning. However the clone
# depth must not be too small, otherwise builds might fail when lots of patches
# are committed in succession, because the desired commit is not found on the
# truncated history.
#
# See also:
# - https://www.appveyor.com/blog/2014/06/04/shallow-clone-for-git-repositories
clone_depth: 100
cache:
- win_flex_bison-2.4.5.zip


@ -2161,7 +2161,12 @@ gallium_require_drm_loader() {
fi
}
dnl This is for Glamor. Skip this if OpenGL is disabled.
require_egl_drm() {
if test "x$enable_opengl" = xno; then
return 0
fi
case "$with_egl_platforms" in
*drm*)
;;


@ -135,7 +135,7 @@ GL 4.2, GLSL 4.20:
GL_ARB_texture_compression_bptc DONE (i965, nvc0, r600, radeonsi)
GL_ARB_compressed_texture_pixel_storage DONE (all drivers)
GL_ARB_shader_atomic_counters DONE (i965)
GL_ARB_shader_atomic_counters DONE (i965, nvc0)
GL_ARB_texture_storage DONE (all drivers)
GL_ARB_transform_feedback_instanced DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_base_instance DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
@ -164,7 +164,7 @@ GL 4.3, GLSL 4.30:
GL_ARB_program_interface_query DONE (all drivers)
GL_ARB_robust_buffer_access_behavior not started
GL_ARB_shader_image_size DONE (i965)
GL_ARB_shader_storage_buffer_object DONE (i965)
GL_ARB_shader_storage_buffer_object DONE (i965, nvc0)
GL_ARB_stencil_texturing DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_buffer_range DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
GL_ARB_texture_query_levels DONE (all drivers that support GLSL 1.30)
@ -186,7 +186,7 @@ GL 4.4, GLSL 4.40:
- specified transform/feedback layout in progress
- input/output block locations DONE
GL_ARB_multi_bind DONE (all drivers)
GL_ARB_query_buffer_object not started
GL_ARB_query_buffer_object DONE (nvc0)
GL_ARB_texture_mirror_clamp_to_edge DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_stencil8 DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_vertex_type_10f_11f_11f_rev DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)


@ -96,6 +96,7 @@ glGetString(GL_SHADING_LANGUAGE_VERSION). Valid values are integers, such as
"130". Mesa will not really implement all the features of the given language version
if it's higher than what's normally reported. (for developers only)
<li>MESA_GLSL - <a href="shading.html#envvars">shading language compiler options</a>
<li>MESA_NO_MINMAX_CACHE - when set, the minmax index cache is globally disabled.
</ul>


@ -48,7 +48,10 @@ Note: some of the new features are only available with certain drivers.
<li>GL_ARB_compute_shader on i965</li>
<li>GL_ARB_copy_image on r600</li>
<li>GL_ARB_indirect_parameters on nvc0</li>
<li>GL_ARB_query_buffer_object on nvc0</li>
<li>GL_ARB_shader_atomic_counters on nvc0</li>
<li>GL_ARB_shader_draw_parameters on i965, nvc0</li>
<li>GL_ARB_shader_storage_buffer_object on nvc0</li>
<li>GL_ARB_tessellation_shader on i965 and r600 (evergreen/cayman only)</li>
<li>GL_ARB_texture_buffer_object_rgb32 on freedreno/a4xx</li>
<li>GL_ARB_texture_buffer_range on freedreno/a4xx</li>
@ -58,6 +61,8 @@ Note: some of the new features are only available with certain drivers.
<li>GL_ARB_vertex_type_10f_11f_11f_rev on freedreno/a4xx</li>
<li>GL_KHR_texture_compression_astc_ldr on freedreno/a4xx</li>
<li>GL_AMD_performance_monitor on radeonsi (CIK+ only)</li>
<li>GL_ATI_meminfo on r600, radeonsi</li>
<li>GL_NVX_gpu_memory_info on r600, radeonsi</li>
<li>New OSMesaCreateContextAttribs() function (for creating core profile
contexts)</li>
</ul>


@ -227,6 +227,7 @@ typedef struct _RGNDATA {
#define D3DERR_DRIVERINVALIDCALL MAKE_D3DHRESULT(2157)
#define D3DERR_DEVICEREMOVED MAKE_D3DHRESULT(2160)
#define D3DERR_DEVICEHUNG MAKE_D3DHRESULT(2164)
#define S_PRESENT_OCCLUDED MAKE_D3DSTATUS(2168)
/********************************************************
* Bitmasks *


@ -69,6 +69,8 @@ typedef struct ID3DPresentVtbl
HRESULT (WINAPI *SetCursor)(ID3DPresent *This, void *pBitmap, POINT *pHotspot, BOOL bShow);
HRESULT (WINAPI *SetGammaRamp)(ID3DPresent *This, const D3DGAMMARAMP *pRamp, HWND hWndOverride);
HRESULT (WINAPI *GetWindowInfo)(ID3DPresent *This, HWND hWnd, int *width, int *height, int *depth);
/* Available since version 1.1 */
BOOL (WINAPI *GetWindowOccluded)(ID3DPresent *This);
} ID3DPresentVtbl;
struct ID3DPresent
@ -96,6 +98,7 @@ struct ID3DPresent
#define ID3DPresent_SetCursor(p,a,b,c) (p)->lpVtbl->SetCursor(p,a,b,c)
#define ID3DPresent_SetGammaRamp(p,a,b) (p)->lpVtbl->SetGammaRamp(p,a,b)
#define ID3DPresent_GetWindowInfo(p,a,b,c,d) (p)->lpVtbl->GetWindowSize(p,a,b,c,d)
#define ID3DPresent_GetWindowOccluded(p) (p)->lpVtbl->GetWindowOccluded(p)
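GetWindowOccluded only exists from present interface version 1.1 onward, so callers are expected to version-check before using the new vtbl entry. A minimal sketch, assuming the minor version was already obtained (e.g. via the present group's version query); the 'minor' variable is hypothetical:

    /* skip the call entirely on pre-1.1 interfaces */
    if (minor >= 1 && ID3DPresent_GetWindowOccluded(present)) {
        /* window occluded: skip rendering and report S_PRESENT_OCCLUDED */
    }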
typedef struct ID3DPresentGroupVtbl
{

src/compiler/.gitignore (new file)

@ -0,0 +1 @@
glsl_compiler


@ -220,9 +220,11 @@ YACC_GEN = $(AM_V_YACC)$(YACC) $(YFLAGS)
LEX_GEN = $(AM_V_LEX)$(LEX) $(LFLAGS)
glsl/glsl_parser.cpp glsl/glsl_parser.h: glsl/glsl_parser.yy
$(MKDIR_GEN)
$(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl/glsl_parser.h $(srcdir)/glsl/glsl_parser.yy
glsl/glsl_lexer.cpp: glsl/glsl_lexer.ll
$(MKDIR_GEN)
$(LEX_GEN) -o $@ $(srcdir)/glsl/glsl_lexer.ll
glsl/glcpp/glcpp-parse.c glsl/glcpp/glcpp-parse.h: glsl/glcpp/glcpp-parse.y


@ -1,4 +1,3 @@
glsl_compiler
glsl_lexer.cpp
glsl_parser.cpp
glsl_parser.h


@ -291,6 +291,10 @@ apply_implicit_conversion(const glsl_type *to, ir_rvalue * &from,
if (!state->is_version(120, 0))
return false;
/* ESSL does not allow implicit conversions */
if (state->es_shader)
return false;
/* From page 27 (page 33 of the PDF) of the GLSL 1.50 spec:
*
* "There are no implicit array or structure conversions. For


@ -661,7 +661,7 @@ private:
BA1(roundEven)
BA1(ceil)
BA1(fract)
B2(mod)
BA2(mod)
BA1(modf)
BA2(min)
BA2(max)
@ -1242,23 +1242,23 @@ builtin_builder::create_builtins()
FD(fract)
add_function("mod",
_mod(glsl_type::float_type, glsl_type::float_type),
_mod(glsl_type::vec2_type, glsl_type::float_type),
_mod(glsl_type::vec3_type, glsl_type::float_type),
_mod(glsl_type::vec4_type, glsl_type::float_type),
_mod(always_available, glsl_type::float_type, glsl_type::float_type),
_mod(always_available, glsl_type::vec2_type, glsl_type::float_type),
_mod(always_available, glsl_type::vec3_type, glsl_type::float_type),
_mod(always_available, glsl_type::vec4_type, glsl_type::float_type),
_mod(glsl_type::vec2_type, glsl_type::vec2_type),
_mod(glsl_type::vec3_type, glsl_type::vec3_type),
_mod(glsl_type::vec4_type, glsl_type::vec4_type),
_mod(always_available, glsl_type::vec2_type, glsl_type::vec2_type),
_mod(always_available, glsl_type::vec3_type, glsl_type::vec3_type),
_mod(always_available, glsl_type::vec4_type, glsl_type::vec4_type),
_mod(glsl_type::double_type, glsl_type::double_type),
_mod(glsl_type::dvec2_type, glsl_type::double_type),
_mod(glsl_type::dvec3_type, glsl_type::double_type),
_mod(glsl_type::dvec4_type, glsl_type::double_type),
_mod(fp64, glsl_type::double_type, glsl_type::double_type),
_mod(fp64, glsl_type::dvec2_type, glsl_type::double_type),
_mod(fp64, glsl_type::dvec3_type, glsl_type::double_type),
_mod(fp64, glsl_type::dvec4_type, glsl_type::double_type),
_mod(glsl_type::dvec2_type, glsl_type::dvec2_type),
_mod(glsl_type::dvec3_type, glsl_type::dvec3_type),
_mod(glsl_type::dvec4_type, glsl_type::dvec4_type),
_mod(fp64, glsl_type::dvec2_type, glsl_type::dvec2_type),
_mod(fp64, glsl_type::dvec3_type, glsl_type::dvec3_type),
_mod(fp64, glsl_type::dvec4_type, glsl_type::dvec4_type),
NULL);
FD(modf)
@ -3452,9 +3452,10 @@ UNOPA(ceil, ir_unop_ceil)
UNOPA(fract, ir_unop_fract)
ir_function_signature *
builtin_builder::_mod(const glsl_type *x_type, const glsl_type *y_type)
builtin_builder::_mod(builtin_available_predicate avail,
const glsl_type *x_type, const glsl_type *y_type)
{
return binop(always_available, ir_binop_mod, x_type, x_type, y_type);
return binop(avail, ir_binop_mod, x_type, x_type, y_type);
}
ir_function_signature *


@ -328,6 +328,11 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
this->fields[this->num_fields].sample = 0;
this->fields[this->num_fields].patch = 0;
this->fields[this->num_fields].precision = GLSL_PRECISION_NONE;
this->fields[this->num_fields].image_read_only = 0;
this->fields[this->num_fields].image_write_only = 0;
this->fields[this->num_fields].image_coherent = 0;
this->fields[this->num_fields].image_volatile = 0;
this->fields[this->num_fields].image_restrict = 0;
this->num_fields++;
}
@ -1201,7 +1206,12 @@ builtin_variable_generator::generate_varyings()
/* gl_Position and gl_PointSize are not visible from fragment shaders. */
if (state->stage != MESA_SHADER_FRAGMENT) {
add_varying(VARYING_SLOT_POS, vec4_t, "gl_Position");
add_varying(VARYING_SLOT_PSIZ, float_t, "gl_PointSize");
if (!state->es_shader ||
state->stage == MESA_SHADER_VERTEX ||
(state->stage == MESA_SHADER_GEOMETRY &&
state->OES_geometry_point_size_enable)) {
add_varying(VARYING_SLOT_PSIZ, float_t, "gl_PointSize");
}
}
if (state->is_version(130, 0)) {


@ -2386,6 +2386,13 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
add_builtin_define(parser, "GL_OES_texture_storage_multisample_2d_array", 1);
if (extensions->ARB_blend_func_extended)
add_builtin_define(parser, "GL_EXT_blend_func_extended", 1);
if (version >= 310) {
if (extensions->OES_geometry_shader) {
add_builtin_define(parser, "GL_OES_geometry_point_size", 1);
add_builtin_define(parser, "GL_OES_geometry_shader", 1);
}
}
}
} else {
add_builtin_define(parser, "GL_ARB_draw_buffers", 1);


@ -600,6 +600,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
/* OES extensions go here, sorted alphabetically.
*/
EXT(OES_EGL_image_external, false, true, OES_EGL_image_external),
EXT(OES_geometry_point_size, false, true, OES_geometry_shader),
EXT(OES_geometry_shader, false, true, OES_geometry_shader),
EXT(OES_standard_derivatives, false, true, OES_standard_derivatives),
EXT(OES_texture_3D, false, true, dummy_true),
@ -1867,59 +1868,76 @@ do_common_optimization(exec_list *ir, bool linked,
const struct gl_shader_compiler_options *options,
bool native_integers)
{
const bool debug = false;
GLboolean progress = GL_FALSE;
progress = lower_instructions(ir, SUB_TO_ADD_NEG) || progress;
#define OPT(PASS, ...) do { \
if (debug) { \
fprintf(stderr, "START GLSL optimization %s\n", #PASS); \
const bool opt_progress = PASS(__VA_ARGS__); \
progress = opt_progress || progress; \
if (opt_progress) \
_mesa_print_ir(stderr, ir, NULL); \
fprintf(stderr, "GLSL optimization %s: %s progress\n", \
#PASS, opt_progress ? "made" : "no"); \
} else { \
progress = PASS(__VA_ARGS__) || progress; \
} \
} while (false)
OPT(lower_instructions, ir, SUB_TO_ADD_NEG);
if (linked) {
progress = do_function_inlining(ir) || progress;
progress = do_dead_functions(ir) || progress;
progress = do_structure_splitting(ir) || progress;
OPT(do_function_inlining, ir);
OPT(do_dead_functions, ir);
OPT(do_structure_splitting, ir);
}
progress = do_if_simplification(ir) || progress;
progress = opt_flatten_nested_if_blocks(ir) || progress;
progress = opt_conditional_discard(ir) || progress;
progress = do_copy_propagation(ir) || progress;
progress = do_copy_propagation_elements(ir) || progress;
OPT(do_if_simplification, ir);
OPT(opt_flatten_nested_if_blocks, ir);
OPT(opt_conditional_discard, ir);
OPT(do_copy_propagation, ir);
OPT(do_copy_propagation_elements, ir);
if (options->OptimizeForAOS && !linked)
progress = opt_flip_matrices(ir) || progress;
OPT(opt_flip_matrices, ir);
if (linked && options->OptimizeForAOS) {
progress = do_vectorize(ir) || progress;
OPT(do_vectorize, ir);
}
if (linked)
progress = do_dead_code(ir, uniform_locations_assigned) || progress;
OPT(do_dead_code, ir, uniform_locations_assigned);
else
progress = do_dead_code_unlinked(ir) || progress;
progress = do_dead_code_local(ir) || progress;
progress = do_tree_grafting(ir) || progress;
progress = do_constant_propagation(ir) || progress;
OPT(do_dead_code_unlinked, ir);
OPT(do_dead_code_local, ir);
OPT(do_tree_grafting, ir);
OPT(do_constant_propagation, ir);
if (linked)
progress = do_constant_variable(ir) || progress;
OPT(do_constant_variable, ir);
else
progress = do_constant_variable_unlinked(ir) || progress;
progress = do_constant_folding(ir) || progress;
progress = do_minmax_prune(ir) || progress;
progress = do_rebalance_tree(ir) || progress;
progress = do_algebraic(ir, native_integers, options) || progress;
progress = do_lower_jumps(ir) || progress;
progress = do_vec_index_to_swizzle(ir) || progress;
progress = lower_vector_insert(ir, false) || progress;
progress = do_swizzle_swizzle(ir) || progress;
progress = do_noop_swizzle(ir) || progress;
OPT(do_constant_variable_unlinked, ir);
OPT(do_constant_folding, ir);
OPT(do_minmax_prune, ir);
OPT(do_rebalance_tree, ir);
OPT(do_algebraic, ir, native_integers, options);
OPT(do_lower_jumps, ir);
OPT(do_vec_index_to_swizzle, ir);
OPT(lower_vector_insert, ir, false);
OPT(do_swizzle_swizzle, ir);
OPT(do_noop_swizzle, ir);
progress = optimize_split_arrays(ir, linked) || progress;
progress = optimize_redundant_jumps(ir) || progress;
OPT(optimize_split_arrays, ir, linked);
OPT(optimize_redundant_jumps, ir);
loop_state *ls = analyze_loop_variables(ir);
if (ls->loop_found) {
progress = set_loop_controls(ir, ls) || progress;
progress = unroll_loops(ir, ls, options) || progress;
OPT(set_loop_controls, ir, ls);
OPT(unroll_loops, ir, ls, options);
}
delete ls;
#undef OPT
return progress;
}


@ -591,6 +591,8 @@ struct _mesa_glsl_parse_state {
*/
bool OES_EGL_image_external_enable;
bool OES_EGL_image_external_warn;
bool OES_geometry_point_size_enable;
bool OES_geometry_point_size_warn;
bool OES_geometry_shader_enable;
bool OES_geometry_shader_warn;
bool OES_standard_derivatives_enable;


@ -471,10 +471,11 @@ private:
*/
class parcel_out_uniform_storage : public program_resource_visitor {
public:
parcel_out_uniform_storage(struct string_to_uint_map *map,
parcel_out_uniform_storage(struct gl_shader_program *prog,
struct string_to_uint_map *map,
struct gl_uniform_storage *uniforms,
union gl_constant_value *values)
: map(map), uniforms(uniforms), values(values)
: prog(prog), map(map), uniforms(uniforms), values(values)
{
}
@ -492,8 +493,7 @@ public:
memset(this->targets, 0, sizeof(this->targets));
}
void set_and_process(struct gl_shader_program *prog,
ir_variable *var)
void set_and_process(ir_variable *var)
{
current_var = var;
field_counter = 0;
@ -643,6 +643,16 @@ private:
uniform->opaque[shader_type].index = this->next_image;
uniform->opaque[shader_type].active = true;
/* Set image access qualifiers */
const GLenum access =
(current_var->data.image_read_only ? GL_READ_ONLY :
current_var->data.image_write_only ? GL_WRITE_ONLY :
GL_READ_WRITE);
for (unsigned j = 0; j < MAX2(1, uniform->array_elements); ++j)
prog->_LinkedShaders[shader_type]->
ImageAccess[this->next_image + j] = access;
/* Increment the image index by 1 for non-arrays and by the
* number of array elements for arrays.
*/
@ -844,6 +854,11 @@ private:
this->values += values_for_type(type);
}
/**
* Current program being processed.
*/
struct gl_shader_program *prog;
struct string_to_uint_map *map;
struct gl_uniform_storage *uniforms;
@ -1007,40 +1022,6 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
}
}
static void
link_set_image_access_qualifiers(struct gl_shader_program *prog,
gl_shader *sh, unsigned shader_stage,
ir_variable *var, const glsl_type *type,
char **name, size_t name_length)
{
/* Handle arrays of arrays */
if (type->is_array() && type->fields.array->is_array()) {
for (unsigned i = 0; i < type->length; i++) {
size_t new_length = name_length;
/* Append the subscript to the current variable name */
ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i);
link_set_image_access_qualifiers(prog, sh, shader_stage, var,
type->fields.array, name,
new_length);
}
} else {
unsigned id = 0;
bool found = prog->UniformHash->get(id, *name);
assert(found);
(void) found;
const gl_uniform_storage *storage = &prog->UniformStorage[id];
const unsigned index = storage->opaque[shader_stage].index;
const GLenum access = (var->data.image_read_only ? GL_READ_ONLY :
var->data.image_write_only ? GL_WRITE_ONLY :
GL_READ_WRITE);
for (unsigned j = 0; j < MAX2(1, storage->array_elements); ++j)
sh->ImageAccess[index + j] = access;
}
}
/**
* Combine the hidden uniform hash map with the uniform hash map so that the
* hidden uniforms will be given indicies at the end of the uniform storage
@ -1148,7 +1129,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
union gl_constant_value *data_end = &data[num_data_slots];
#endif
parcel_out_uniform_storage parcel(prog->UniformHash, uniforms, data);
parcel_out_uniform_storage parcel(prog, prog->UniformHash, uniforms, data);
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (prog->_LinkedShaders[i] == NULL)
@ -1163,7 +1144,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
var->data.mode != ir_var_shader_storage))
continue;
parcel.set_and_process(prog, var);
parcel.set_and_process(var);
}
prog->_LinkedShaders[i]->active_samplers = parcel.shader_samplers_used;
@ -1301,29 +1282,6 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
prog->NumHiddenUniforms = hidden_uniforms;
prog->UniformStorage = uniforms;
/**
* Scan the program for image uniforms and store image unit access
* information into the gl_shader data structure.
*/
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
gl_shader *sh = prog->_LinkedShaders[i];
if (sh == NULL)
continue;
foreach_in_list(ir_instruction, node, sh->ir) {
ir_variable *var = node->as_variable();
if (var && var->data.mode == ir_var_uniform &&
var->type->contains_image()) {
char *name_copy = ralloc_strdup(NULL, var->name);
link_set_image_access_qualifiers(prog, sh, i, var, var->type,
&name_copy, strlen(var->name));
ralloc_free(name_copy);
}
}
}
link_set_uniform_initializers(prog, boolean_true);
return;


@ -967,11 +967,16 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
return;
}
if ((consumer_var == NULL && producer_var->type->contains_integer()) ||
bool needs_flat_qualifier = consumer_var == NULL &&
(producer_var->type->contains_integer() ||
producer_var->type->contains_double());
if (needs_flat_qualifier ||
(consumer_stage != -1 && consumer_stage != MESA_SHADER_FRAGMENT)) {
/* Since this varying is not being consumed by the fragment shader, its
* interpolation type varying cannot possibly affect rendering.
* Also, this variable is non-flat and is (or contains) an integer.
* Also, this variable is non-flat and is (or contains) an integer
* or a double.
* If the consumer stage is unknown, don't modify the interpolation
* type as it could affect rendering later with separate shaders.
*


@ -4633,8 +4633,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
&prog->NumShaderStorageBlocks,
&prog->SsboInterfaceBlockIndex);
/* FINISHME: Assign fragment shader output locations. */
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (prog->_LinkedShaders[i] == NULL)
continue;


@ -327,6 +327,7 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx,
unsigned *const_offset,
bool *row_major,
int *matrix_columns,
const glsl_struct_field **struct_field,
unsigned packing)
{
*offset = new(mem_ctx) ir_constant(0u);
@ -442,8 +443,11 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx,
intra_struct_offset = glsl_align(intra_struct_offset, field_align);
if (strcmp(struct_type->fields.structure[i].name,
deref_record->field) == 0)
deref_record->field) == 0) {
if (struct_field)
*struct_field = &struct_type->fields.structure[i];
break;
}
if (packing == GLSL_INTERFACE_PACKING_STD430)
intra_struct_offset += type->std430_size(field_row_major);


@ -57,6 +57,7 @@ public:
void setup_buffer_access(void *mem_ctx, ir_variable *var, ir_rvalue *deref,
ir_rvalue **offset, unsigned *const_offset,
bool *row_major, int *matrix_columns,
const glsl_struct_field **struct_field,
unsigned packing);
};


@ -142,7 +142,7 @@ lower_shared_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
setup_buffer_access(mem_ctx, var, deref,
&offset, &const_offset,
&row_major, &matrix_columns, packing);
&row_major, &matrix_columns, NULL, packing);
/* Now that we've calculated the offset to the start of the
* dereference, walk over the type and emit loads into a temporary.
@ -210,7 +210,7 @@ lower_shared_reference_visitor::handle_assignment(ir_assignment *ir)
setup_buffer_access(mem_ctx, var, deref,
&offset, &const_offset,
&row_major, &matrix_columns, packing);
&row_major, &matrix_columns, NULL, packing);
deref = new(mem_ctx) ir_dereference_variable(store_var);
@ -370,7 +370,7 @@ lower_shared_reference_visitor::lower_shared_atomic_intrinsic(ir_call *ir)
setup_buffer_access(mem_ctx, var, deref,
&offset, &const_offset,
&row_major, &matrix_columns, packing);
&row_major, &matrix_columns, NULL, packing);
assert(offset);
assert(!row_major);


@ -45,7 +45,7 @@ class lower_ubo_reference_visitor :
public lower_buffer_access::lower_buffer_access {
public:
lower_ubo_reference_visitor(struct gl_shader *shader)
: shader(shader)
: shader(shader), struct_field(NULL), variable(NULL)
{
}
@ -60,6 +60,7 @@ public:
bool *row_major,
int *matrix_columns,
unsigned packing);
uint32_t ssbo_access_params();
ir_expression *ubo_load(void *mem_ctx, const struct glsl_type *type,
ir_rvalue *offset);
ir_call *ssbo_load(void *mem_ctx, const struct glsl_type *type,
@ -104,6 +105,8 @@ public:
struct gl_shader *shader;
struct gl_uniform_buffer_variable *ubo_var;
const struct glsl_struct_field *struct_field;
ir_variable *variable;
ir_rvalue *uniform_block;
bool progress;
};
@ -288,8 +291,9 @@ lower_ubo_reference_visitor::setup_for_load_or_store(void *mem_ctx,
*const_offset = ubo_var->Offset;
this->struct_field = NULL;
setup_buffer_access(mem_ctx, var, deref, offset, const_offset, row_major,
matrix_columns, packing);
matrix_columns, &this->struct_field, packing);
}
void
@ -317,6 +321,7 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
this->buffer_access_type =
var->is_in_shader_storage_block() ?
ssbo_load_access : ubo_load_access;
this->variable = var;
/* Compute the offset to the start of the dereference as well as other
* information we need to configure the write
@ -370,6 +375,24 @@ shader_storage_buffer_object(const _mesa_glsl_parse_state *state)
return state->ARB_shader_storage_buffer_object_enable;
}
uint32_t
lower_ubo_reference_visitor::ssbo_access_params()
{
assert(variable);
if (variable->is_interface_instance()) {
assert(struct_field);
return ((struct_field->image_coherent ? ACCESS_COHERENT : 0) |
(struct_field->image_restrict ? ACCESS_RESTRICT : 0) |
(struct_field->image_volatile ? ACCESS_VOLATILE : 0));
} else {
return ((variable->data.image_coherent ? ACCESS_COHERENT : 0) |
(variable->data.image_restrict ? ACCESS_RESTRICT : 0) |
(variable->data.image_volatile ? ACCESS_VOLATILE : 0));
}
}
ir_call *
lower_ubo_reference_visitor::ssbo_store(void *mem_ctx,
ir_rvalue *deref,
@ -394,6 +417,10 @@ lower_ubo_reference_visitor::ssbo_store(void *mem_ctx,
ir_variable(glsl_type::uint_type, "write_mask" , ir_var_function_in);
sig_params.push_tail(writemask_ref);
ir_variable *access_ref = new(mem_ctx)
ir_variable(glsl_type::uint_type, "access" , ir_var_function_in);
sig_params.push_tail(access_ref);
ir_function_signature *sig = new(mem_ctx)
ir_function_signature(glsl_type::void_type, shader_storage_buffer_object);
assert(sig);
@ -408,6 +435,7 @@ lower_ubo_reference_visitor::ssbo_store(void *mem_ctx,
call_params.push_tail(offset->clone(mem_ctx, NULL));
call_params.push_tail(deref->clone(mem_ctx, NULL));
call_params.push_tail(new(mem_ctx) ir_constant(write_mask));
call_params.push_tail(new(mem_ctx) ir_constant(ssbo_access_params()));
return new(mem_ctx) ir_call(sig, NULL, &call_params);
}
@ -426,6 +454,10 @@ lower_ubo_reference_visitor::ssbo_load(void *mem_ctx,
ir_variable(glsl_type::uint_type, "offset_ref" , ir_var_function_in);
sig_params.push_tail(offset_ref);
ir_variable *access_ref = new(mem_ctx)
ir_variable(glsl_type::uint_type, "access" , ir_var_function_in);
sig_params.push_tail(access_ref);
ir_function_signature *sig =
new(mem_ctx) ir_function_signature(type, shader_storage_buffer_object);
assert(sig);
@ -444,6 +476,7 @@ lower_ubo_reference_visitor::ssbo_load(void *mem_ctx,
exec_list call_params;
call_params.push_tail(this->uniform_block->clone(mem_ctx, NULL));
call_params.push_tail(offset->clone(mem_ctx, NULL));
call_params.push_tail(new(mem_ctx) ir_constant(ssbo_access_params()));
return new(mem_ctx) ir_call(sig, deref_result, &call_params);
}
@ -499,6 +532,7 @@ lower_ubo_reference_visitor::write_to_memory(void *mem_ctx,
unsigned packing = var->get_interface_type()->interface_packing;
this->buffer_access_type = ssbo_store_access;
this->variable = var;
/* Compute the offset to the start of the dereference as well as other
* information we need to configure the write
@ -678,6 +712,7 @@ lower_ubo_reference_visitor::process_ssbo_unsized_array_length(ir_rvalue **rvalu
int unsized_array_stride = calculate_unsized_array_stride(deref, packing);
this->buffer_access_type = ssbo_unsized_array_length_access;
this->variable = var;
/* Compute the offset to the start of the dereference as well as other
* information we need to calculate the length.
@ -910,6 +945,7 @@ lower_ubo_reference_visitor::lower_ssbo_atomic_intrinsic(ir_call *ir)
unsigned packing = var->get_interface_type()->interface_packing;
this->buffer_access_type = ssbo_atomic_access;
this->variable = var;
setup_for_load_or_store(mem_ctx, var, deref,
&offset, &const_offset,


@ -361,11 +361,12 @@ tree_grafting_basic_block(ir_instruction *bb_first,
if (!lhs_var)
continue;
if (lhs_var->data.mode == ir_var_function_out ||
lhs_var->data.mode == ir_var_function_inout ||
lhs_var->data.mode == ir_var_shader_out ||
lhs_var->data.mode == ir_var_shader_storage)
continue;
if (lhs_var->data.mode == ir_var_function_out ||
lhs_var->data.mode == ir_var_function_inout ||
lhs_var->data.mode == ir_var_shader_out ||
lhs_var->data.mode == ir_var_shader_storage ||
lhs_var->data.mode == ir_var_shader_shared)
continue;
ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var);


@ -164,6 +164,11 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
this->fields.structure[i].sample = fields[i].sample;
this->fields.structure[i].matrix_layout = fields[i].matrix_layout;
this->fields.structure[i].patch = fields[i].patch;
this->fields.structure[i].image_read_only = fields[i].image_read_only;
this->fields.structure[i].image_write_only = fields[i].image_write_only;
this->fields.structure[i].image_coherent = fields[i].image_coherent;
this->fields.structure[i].image_volatile = fields[i].image_volatile;
this->fields.structure[i].image_restrict = fields[i].image_restrict;
this->fields.structure[i].precision = fields[i].precision;
}
@ -1330,6 +1335,13 @@ glsl_type::can_implicitly_convert_to(const glsl_type *desired,
if (this == desired)
return true;
/* ESSL does not allow implicit conversions. If there is no state, we're
* doing intra-stage function linking where these checks have already been
* done.
*/
if (state && state->es_shader)
return false;
/* There is no conversion among matrix types. */
if (this->matrix_columns > 1 || desired->matrix_columns > 1)
return false;


@ -885,7 +885,8 @@ struct glsl_struct_field {
glsl_struct_field(const struct glsl_type *_type, const char *_name)
: type(_type), name(_name), location(-1), interpolation(0), centroid(0),
sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
precision(GLSL_PRECISION_NONE)
precision(GLSL_PRECISION_NONE), image_read_only(0), image_write_only(0),
image_coherent(0), image_volatile(0), image_restrict(0)
{
/* empty */
}


@ -139,7 +139,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
b->shader->options->lower_pack_unorm_2x16);
nir_ssa_def *word =
nir_extract_uword(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_extract_u16(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_ssa_def *val =
nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)),
nir_channel(b, word, 0));
@ -154,7 +154,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
b->shader->options->lower_pack_unorm_4x8);
nir_ssa_def *byte =
nir_extract_ubyte(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_extract_u8(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_ssa_def *val =
nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)),
nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))),


@ -238,15 +238,15 @@ unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
unop_horiz("pack_uvec2_to_uint", 0, tuint, 2, tuint, """
dst = (src0.x & 0xffff) | (src0.y >> 16);
unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """
dst.x = (src0.x & 0xffff) | (src0.y >> 16);
""")
unop_horiz("pack_uvec4_to_uint", 0, tuint, 4, tuint, """
dst = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")
# Lowered floating point unpacking operations.
@ -562,12 +562,12 @@ dst.y = src1.x;
""")
# Byte extraction
binop("extract_ubyte", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_ibyte", tint, "", "(int8_t)(src0 >> (src1 * 8))")
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
# Word extraction
binop("extract_uword", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_iword", tint, "", "(int16_t)(src0 >> (src1 * 16))")
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, const_expr):


@ -248,19 +248,19 @@ optimizations = [
('ubfe', 'value', 'offset', 'bits')),
'options->lower_bitfield_extract'),
(('extract_ibyte', a, b),
('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 8),
(('extract_i8', a, b),
('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
'options->lower_extract_byte'),
(('extract_ubyte', a, b),
(('extract_u8', a, b),
('iand', ('ushr', a, ('imul', b, 8)), 0xff),
'options->lower_extract_byte'),
(('extract_iword', a, b),
(('extract_i16', a, b),
('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
'options->lower_extract_word'),
(('extract_uword', a, b),
(('extract_u16', a, b),
('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
'options->lower_extract_word'),
@ -285,30 +285,30 @@ optimizations = [
'options->lower_pack_snorm_4x8'),
(('unpack_unorm_2x16', 'v'),
('fdiv', ('u2f', ('vec4', ('extract_uword', 'v', 0),
('extract_uword', 'v', 1), 0, 0)),
('fdiv', ('u2f', ('vec2', ('extract_u16', 'v', 0),
('extract_u16', 'v', 1))),
65535.0),
'options->lower_unpack_unorm_2x16'),
(('unpack_unorm_4x8', 'v'),
('fdiv', ('u2f', ('vec4', ('extract_ubyte', 'v', 0),
('extract_ubyte', 'v', 1),
('extract_ubyte', 'v', 2),
('extract_ubyte', 'v', 3))),
('fdiv', ('u2f', ('vec4', ('extract_u8', 'v', 0),
('extract_u8', 'v', 1),
('extract_u8', 'v', 2),
('extract_u8', 'v', 3))),
255.0),
'options->lower_unpack_unorm_4x8'),
(('unpack_snorm_2x16', 'v'),
('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_iword', 'v', 0),
('extract_iword', 'v', 1), 0, 0)),
('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
('extract_i16', 'v', 1))),
32767.0))),
'options->lower_unpack_snorm_2x16'),
(('unpack_snorm_4x8', 'v'),
('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_ibyte', 'v', 0),
('extract_ibyte', 'v', 1),
('extract_ibyte', 'v', 2),
('extract_ibyte', 'v', 3))),
('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
('extract_i8', 'v', 1),
('extract_i8', 'v', 2),
('extract_i8', 'v', 3))),
127.0))),
'options->lower_unpack_snorm_4x8'),
]


@ -544,6 +544,16 @@ enum gl_frag_depth_layout
FRAG_DEPTH_LAYOUT_UNCHANGED
};
/**
* \brief Buffer access qualifiers
*/
enum gl_buffer_access_qualifier
{
ACCESS_COHERENT = 1,
ACCESS_RESTRICT = 2,
ACCESS_VOLATILE = 4,
};
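These qualifiers form a bitmask, so a declaration such as "coherent volatile buffer" can be encoded as a simple OR of flags; for example:

    unsigned access = ACCESS_COHERENT | ACCESS_VOLATILE;
    if (access & ACCESS_VOLATILE) {
        /* the back-end must not cache this access in registers */
    }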
#ifdef __cplusplus
} /* extern "C" */
#endif


@ -85,7 +85,7 @@ endif
# virgl
ifneq ($(filter virgl, $(MESA_GPU_DRIVERS)),)
SUBDIRS += winsys/virgl/drm drivers/virgl
SUBDIRS += winsys/virgl/drm winsys/virgl/vtest drivers/virgl
endif
# vmwgfx


@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
*
* Convert float32 to half floats, preserving Infs and NaNs,
* with rounding towards zero (trunc).
* XXX: For GL, would prefer rounding towards nearest(-even).
*/
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
LLVMValueRef result;
/*
* Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
* directly, without any (x86 or generic) intrinsics.
* Albeit the rounding mode cannot be specified (and is undefined,
* though in practice on x86 seems to do nearest-even but it may
* be dependent on instruction set support), so is essentially
* useless.
*/
if (util_cpu_caps.has_f16c &&
(length == 4 || length == 8)) {
struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
LLVMValueRef index = LLVMConstInt(i32t, i, 0);
LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
/* XXX: not really supported by backends */
/*
* XXX: not really supported by backends.
* Even if they would now, rounding mode cannot be specified and
* is undefined.
*/
LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");


@ -256,6 +256,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
}
/**
* Un-interleave vector.
* This will return a vector consisting of every second element
* (depending on lo_hi, beginning at 0 or 1).
* The returned vector size (elems and width) will only be half
* that of the source vector.
*/
LLVMValueRef
lp_build_uninterleave1(struct gallivm_state *gallivm,
unsigned num_elems,
LLVMValueRef a,
unsigned lo_hi)
{
LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
unsigned i;
assert(num_elems <= LP_MAX_VECTOR_LENGTH);
for (i = 0; i < num_elems / 2; ++i)
elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
shuffle = LLVMConstVector(elems, num_elems / 2);
return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
}
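For reference, a scalar model of the shuffle this helper builds (a sketch, not the gallivm code path):

    /* out receives every second element of in, starting at lo_hi (0 or 1) */
    static void uninterleave1(int *out, const int *in, unsigned n, unsigned lo_hi) {
        for (unsigned i = 0; i < n / 2; i++)
            out[i] = in[2 * i + lo_hi];
    }
    /* in = {a0, b0, a1, b1}, lo_hi = 1  ->  out = {b0, b1} */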
/**
* Interleave vector elements.
*


@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
LLVMValueRef b,
unsigned lo_hi);
LLVMValueRef
lp_build_uninterleave1(struct gallivm_state *gallivm,
unsigned num_elems,
LLVMValueRef a,
unsigned lo_hi);
void
lp_build_unpack2(struct gallivm_state *gallivm,


@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
/* Ignore deprecated instructions */
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_UP2H:
case TGSI_OPCODE_UP2US:
case TGSI_OPCODE_UP4B:
case TGSI_OPCODE_UP4UB:


@ -45,8 +45,10 @@
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_gather.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "tgsi/tgsi_exec.h"
@ -530,6 +532,77 @@ static struct lp_build_tgsi_action log_action = {
log_emit /* emit */
};
/* TGSI_OPCODE_PK2H */
static void
pk2h_fetch_args(
struct lp_build_tgsi_context * bld_base,
struct lp_build_emit_data * emit_data)
{
/* src0.x */
emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
0, TGSI_CHAN_X);
/* src0.y */
emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
0, TGSI_CHAN_Y);
}
static void
pk2h_emit(
const struct lp_build_tgsi_action *action,
struct lp_build_tgsi_context *bld_base,
struct lp_build_emit_data *emit_data)
{
struct gallivm_state *gallivm = bld_base->base.gallivm;
struct lp_type f16i_t;
LLVMValueRef lo, hi, res;
f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
/* maybe some interleave doubling vector width would be useful... */
lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length * 2);
hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length * 2);
res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
emit_data->output[emit_data->chan] = res;
}
static struct lp_build_tgsi_action pk2h_action = {
pk2h_fetch_args, /* fetch_args */
pk2h_emit /* emit */
};
/* TGSI_OPCODE_UP2H */
static void
up2h_emit(
const struct lp_build_tgsi_action *action,
struct lp_build_tgsi_context *bld_base,
struct lp_build_emit_data *emit_data)
{
struct gallivm_state *gallivm = bld_base->base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
LLVMContextRef context = gallivm->context;
LLVMValueRef lo, hi, res[2], arg;
unsigned nr = bld_base->base.type.length;
LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context), nr * 2);
arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
res[0] = lp_build_half_to_float(gallivm, lo);
res[1] = lp_build_half_to_float(gallivm, hi);
emit_data->output[0] = emit_data->output[2] = res[0];
emit_data->output[1] = emit_data->output[3] = res[1];
}
static struct lp_build_tgsi_action up2h_action = {
scalar_unary_fetch_args, /* fetch_args */
up2h_emit /* emit */
};
/* TGSI_OPCODE_LRP */
static void
@ -1032,10 +1105,12 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;
bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args = scalar_unary_fetch_args;


@ -226,14 +226,9 @@ pipe_freedreno_create_screen(int fd)
struct pipe_screen *
pipe_virgl_create_screen(int fd)
{
struct virgl_winsys *vws;
struct pipe_screen *screen;
vws = virgl_drm_winsys_create(fd);
if (!vws)
return NULL;
screen = virgl_create_screen(vws);
screen = virgl_drm_screen_create(fd);
return screen ? debug_screen_wrap(screen) : NULL;
}


@ -58,6 +58,7 @@
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
#include "util/u_half.h"
#include "util/u_memory.h"
#include "util/u_math.h"
@ -3057,6 +3058,45 @@ exec_dp2(struct tgsi_exec_machine *mach,
}
}
static void
exec_pk2h(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst)
{
unsigned chan;
union tgsi_exec_channel arg[2], dst;
fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
(util_float_to_half(arg[1].f[chan]) << 16);
}
for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
}
}
}
static void
exec_up2h(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst)
{
unsigned chan;
union tgsi_exec_channel arg, dst[2];
fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
}
for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
}
}
}
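A worked value check for the two interpreter helpers above, reusing the util_half conversions they call (a sketch; the bit patterns follow from IEEE half precision):

    #include <assert.h>
    #include <stdint.h>
    #include "util/u_half.h"

    static void check_pk2h_up2h(void) {
        /* PK2H of (1.0, 2.0): 0x3c00 in the low half, 0x4000 in the high */
        uint32_t packed = util_float_to_half(1.0f) |
                          ((uint32_t)util_float_to_half(2.0f) << 16);
        assert(packed == 0x40003c00);
        /* UP2H recovers both values exactly */
        assert(util_half_to_float(packed & 0xffff) == 1.0f);
        assert(util_half_to_float(packed >> 16) == 2.0f);
    }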
static void
exec_scs(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst)
@ -4339,7 +4379,7 @@ exec_instruction(
break;
case TGSI_OPCODE_PK2H:
assert (0);
exec_pk2h(mach, inst);
break;
case TGSI_OPCODE_PK2US:
@ -4425,7 +4465,7 @@ exec_instruction(
break;
case TGSI_OPCODE_UP2H:
assert (0);
exec_up2h(mach, inst);
break;
case TGSI_OPCODE_UP2US:


@ -149,7 +149,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
{ 1, 2, 0, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
{ 1, 2, 0, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
{ 1, 2, 0, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
{ 0, 1, 0, 0, 0, 0, 1, NONE, "", 112 }, /* removed */
{ 0, 1, 0, 0, 0, 0, 0, OTHR, "MEMBAR", TGSI_OPCODE_MEMBAR },
{ 0, 1, 0, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
{ 0, 1, 0, 0, 0, 0, 0, NONE, "", 114 }, /* removed */
{ 0, 1, 0, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
@ -426,6 +426,7 @@ tgsi_opcode_infer_src_type( uint opcode )
case TGSI_OPCODE_SAMPLE_I:
case TGSI_OPCODE_SAMPLE_I_MS:
case TGSI_OPCODE_UMUL_HI:
case TGSI_OPCODE_UP2H:
return TGSI_TYPE_UNSIGNED;
case TGSI_OPCODE_IMUL_HI:
case TGSI_OPCODE_I2F:


@ -377,6 +377,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
info->reads_position = TRUE;
else if (semName == TGSI_SEMANTIC_FACE)
info->uses_frontface = TRUE;
else if (semName == TGSI_SEMANTIC_SAMPLEMASK)
info->reads_samplemask = TRUE;
}
else if (file == TGSI_FILE_OUTPUT) {
info->output_semantic_name[reg] = (ubyte) semName;


@ -81,6 +81,7 @@ struct tgsi_shader_info
ubyte colors_written;
boolean reads_position; /**< does fragment shader read position? */
boolean reads_z; /**< does fragment shader read depth? */
boolean reads_samplemask; /**< does fragment shader read sample mask? */
boolean writes_z; /**< does fragment shader write Z value? */
boolean writes_stencil; /**< does fragment shader write stencil value? */
boolean writes_samplemask; /**< does fragment shader write sample mask? */


@ -195,4 +195,16 @@ u_box_minify_2d(struct pipe_box *dst,
dst->height = MAX2(src->height >> l, 1);
}
static inline void
u_box_minify_3d(struct pipe_box *dst,
const struct pipe_box *src, unsigned l)
{
dst->x = src->x >> l;
dst->y = src->y >> l;
dst->z = src->z >> l;
dst->width = MAX2(src->width >> l, 1);
dst->height = MAX2(src->height >> l, 1);
dst->depth = MAX2(src->depth >> l, 1);
}
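A quick usage sketch, assuming the u_box_3d() initializer from earlier in this header:

    struct pipe_box level0, level2;
    u_box_3d(0, 0, 0, 37, 9, 5, &level0);
    u_box_minify_3d(&level2, &level0, 2);
    /* level2 is 9x2x1: each dimension shifted right by 2, clamped to >= 1 */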
#endif


@ -52,7 +52,7 @@
#include <machine/cpu.h>
#endif
#if defined(PIPE_OS_FREEBSD)
#if defined(PIPE_OS_FREEBSD) || defined(PIPE_OS_DRAGONFLY)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif


@ -313,7 +313,7 @@ def _parse_channels(fields, layout, colorspace, swizzles):
return channels
def parse(filename):
'''Parse the format descrition in CSV format in terms of the
'''Parse the format description in CSV format in terms of the
Channel and Format classes above.'''
stream = open(filename)


@ -74,7 +74,11 @@ util_float_to_half(float f)
f32.ui &= round_mask;
f32.f *= magic.f;
f32.ui -= round_mask;
/*
* XXX: The magic mul relies on denorms being available, otherwise
* all f16 denorms get flushed to zero - hence when this is used
* for tgsi_exec in softpipe we won't get f16 denorms.
*/
/*
* Clamp to max finite value if overflowed.
* OpenGL has completely undefined rounding behavior for float to
@ -112,6 +116,7 @@ util_half_to_float(uint16_t f16)
/* Adjust */
f32.f *= magic.f;
/* XXX: The magic mul relies on denorms being available */
/* Inf / NaN */
if (f32.f >= infnan.f)


@ -49,6 +49,13 @@ enum VS_OUTPUT
VS_O_VTEX = 0
};
const int vl_zscan_normal_16[] =
{
/* Zig-Zag scan pattern */
0, 1, 4, 8, 5, 2, 3, 6,
9,12,13,10, 7,11,14,15
};
const int vl_zscan_linear[] =
{
/* Linear scan pattern */


@ -64,6 +64,7 @@ struct vl_zscan_buffer
struct pipe_surface *dst;
};
extern const int vl_zscan_normal_16[];
extern const int vl_zscan_linear[];
extern const int vl_zscan_normal[];
extern const int vl_zscan_alternate[];


@ -325,6 +325,11 @@ returned). Otherwise, if the ``wait`` parameter is FALSE, the call
will not block and the return value will be TRUE if the query has
completed or FALSE otherwise.
``get_query_result_resource`` is used to store the result of a query into
a resource without synchronizing with the CPU. This write will optionally
wait for the query to complete, and will optionally write whether the value
is available instead of the value itself.
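A hedged sketch of the caller's side, assuming the hook's parameters match its p_context.h declaration (query, wait flag, result value type, result index, destination resource, byte offset); the variable names are illustrative:

    /* store a 64-bit query result into 'buf' at offset 0, without waiting */
    ctx->get_query_result_resource(ctx, query, FALSE,
                                   PIPE_QUERY_TYPE_U64, 0, buf, 0);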
The interface currently includes the following types of queries:
``PIPE_QUERY_OCCLUSION_COUNTER`` counts the number of fragments which


@ -138,6 +138,10 @@ The integer capabilities:
* ``PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT``: Describes the required
alignment for pipe_sampler_view::u.buf.first_element, in bytes.
If a driver does not support first/last_element, it should return 0.
* ``PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY``: Whether the driver only
supports R, RG, RGB and RGBA formats for PIPE_BUFFER sampler views.
When this is the case it should be assumed that the swizzle parameters
in the sampler view have no effect.
* ``PIPE_CAP_TGSI_TEXCOORD``: This CAP describes a hw limitation.
If true, the hardware cannot replace arbitrary shader inputs with sprite
coordinates and hence the inputs that are desired to be replaceable must
@ -164,7 +168,7 @@ The integer capabilities:
view it is intended to be used with, or herein undefined results may occur
for permutational swizzles.
* ``PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE``: The maximum accessible size with
a buffer sampler view, in bytes.
a buffer sampler view, in texels.
* ``PIPE_CAP_MAX_VIEWPORTS``: The maximum number of viewports (and scissors
since they are linked) a driver can support. Returning 0 is equivalent
to returning 1 because every driver has to support at least a single
@ -306,6 +310,15 @@ The integer capabilities:
* ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap
is supported.
* ``PIPE_CAP_STRING_MARKER``: Whether pipe->emit_string_marker() is supported.
* ``PIPE_CAP_SURFACE_REINTERPRET_BLOCKS``: Indicates whether
pipe_context::create_surface supports reinterpreting a texture as a surface
of a format with different block width/height (but same block size in bits).
For example, a compressed texture image can be interpreted as a
non-compressed surface whose texels are the same number of bits as the
compressed blocks, and vice versa. The width and height of the surface is
adjusted appropriately.
* ``PIPE_CAP_QUERY_BUFFER_OBJECT``: Driver supports
context::get_query_result_resource callback.
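State trackers discover these capabilities through pipe_screen::get_param(); a minimal sketch:

    /* only use query-to-buffer result writes if the driver advertises them */
    if (screen->get_param(screen, PIPE_CAP_QUERY_BUFFER_OBJECT)) {
        /* pipe_context::get_query_result_resource() is available */
    }
    if (screen->get_param(screen, PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY)) {
        /* restrict PIPE_BUFFER sampler views to R/RG/RGB/RGBA formats */
    }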
.. _pipe_capf:


@ -2372,6 +2372,23 @@ programs.
the program. Results are unspecified if any of the remaining
threads terminates or never reaches an executed BARRIER instruction.
.. opcode:: MEMBAR - Memory barrier
``MEMBAR type``
This opcode waits for the completion of all memory accesses based on
the type passed in. The type is an immediate bitfield with the following
meaning:
Bit 0: Shader storage buffers
Bit 1: Atomic buffers
Bit 2: Images
Bit 3: Shared memory
Bit 4: Thread group
These may be passed in any combination. An implementation is free not to
distinguish between them as it sees fit; however, they map to all the
possibilities made available by GLSL.
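For example, a barrier covering shader storage buffers and images sets bits 0 and 2 in the immediate; a sketch with locally defined masks mirroring the bit assignments above (the names are illustrative, not part of the interface):

    #define MEMBAR_SHADER_BUFFER (1 << 0)
    #define MEMBAR_ATOMIC_BUFFER (1 << 1)
    #define MEMBAR_IMAGE         (1 << 2)
    #define MEMBAR_SHARED        (1 << 3)
    #define MEMBAR_THREAD_GROUP  (1 << 4)

    unsigned membar_type = MEMBAR_SHADER_BUFFER | MEMBAR_IMAGE;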
.. _atomopcodes:


@ -152,6 +152,9 @@ fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len)
struct fd_ringbuffer *ring = ctx->ring;
const uint32_t *buf = (const void *)string;
/* max packet size is 0x3fff dwords: */
len = MIN2(len, 0x3fff * 4);
OUT_PKT3(ring, CP_NOP, align(len, 4) / 4);
while (len >= 4) {
OUT_RING(ring, *buf);


@ -165,6 +165,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_BARRIER:
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
case PIPE_CAP_COMPUTE:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_SM3:
@ -183,6 +184,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_CLIP_HALFZ:
return is_a3xx(screen) || is_a4xx(screen);
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
return 0;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
if (is_a3xx(screen)) return 16;
if (is_a4xx(screen)) return 32;
@ -248,6 +251,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
return 0;
case PIPE_CAP_MAX_VIEWPORTS:
@ -296,6 +300,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
/* Queries. */
case PIPE_CAP_QUERY_TIME_ELAPSED:
case PIPE_CAP_QUERY_TIMESTAMP:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 0;
case PIPE_CAP_OCCLUSION_QUERY:
return is_a3xx(screen) || is_a4xx(screen);


@ -556,6 +556,10 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp)
}
}
/* NOTE: this creates the "TGSI" style fragface (ie. input slot
* VARYING_SLOT_FACE). For NIR style nir_intrinsic_load_front_face
* we can just use the value from hw directly (since it is boolean)
*/
static struct ir3_instruction *
create_frag_face(struct ir3_compile *ctx, unsigned comp)
{
@ -1224,7 +1228,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
break;
case nir_intrinsic_load_vertex_id_zero_base:
if (!ctx->vertex_id) {
ctx->vertex_id = create_input(ctx->block, 0);
ctx->vertex_id = create_input(b, 0);
add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
ctx->vertex_id);
}
@ -1232,7 +1236,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
break;
case nir_intrinsic_load_instance_id:
if (!ctx->instance_id) {
ctx->instance_id = create_input(ctx->block, 0);
ctx->instance_id = create_input(b, 0);
add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
ctx->instance_id);
}
@ -1244,6 +1248,14 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
}
break;
case nir_intrinsic_load_front_face:
if (!ctx->frag_face) {
ctx->so->frag_face = true;
ctx->frag_face = create_input(b, 0);
ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
}
dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
break;
case nir_intrinsic_discard_if:
case nir_intrinsic_discard: {
struct ir3_instruction *cond, *kill;
@ -1349,6 +1361,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
struct ir3_block *b = ctx->block;
struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
struct ir3_instruction **coord, *lod, *compare, *proj, **off, **ddx, **ddy;
struct ir3_instruction *const_off[4];
bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
unsigned i, coords, flags;
unsigned nsrc0 = 0, nsrc1 = 0;
@ -1392,7 +1405,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
ddy = get_src(ctx, &tex->src[i].src);
break;
default:
compile_error(ctx, "Unhandled NIR tex serc type: %d\n",
compile_error(ctx, "Unhandled NIR tex src type: %d\n",
tex->src[i].src_type);
return;
}
@ -1417,6 +1430,21 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
tex_info(tex, &flags, &coords);
if (!has_off) {
/* could still have a constant offset: */
if (tex->const_offset[0] || tex->const_offset[1] ||
tex->const_offset[2] || tex->const_offset[3]) {
off = const_off;
off[0] = create_immed(b, tex->const_offset[0]);
off[1] = create_immed(b, tex->const_offset[1]);
off[2] = create_immed(b, tex->const_offset[2]);
off[3] = create_immed(b, tex->const_offset[3]);
has_off = true;
}
}
/* scale up integer coords for TXF based on the LOD */
if (ctx->unminify_coords && (opc == OPC_ISAML)) {
assert(has_lod);
@ -2053,6 +2081,9 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CLIP_DIST1:
break;
case VARYING_SLOT_CLIP_VERTEX:
/* handled entirely in nir_lower_clip: */
return;
default:
if (slot >= VARYING_SLOT_VAR0)
break;
@ -2135,11 +2166,17 @@ emit_instructions(struct ir3_compile *ctx)
setup_output(ctx, var);
}
/* Setup variables (which should only be arrays): */
/* Setup global variables (which should only be arrays): */
nir_foreach_variable(var, &ctx->s->globals) {
declare_var(ctx, var);
}
/* Setup local variables (which should only be arrays): */
/* NOTE: need to do something more clever when we support >1 fxn */
nir_foreach_variable(var, &fxn->locals) {
declare_var(ctx, var);
}
/* And emit the body: */
ctx->impl = fxn;
emit_function(ctx, fxn);


@ -262,6 +262,9 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:


@ -428,6 +428,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_CUBE_MAP_ARRAY:
case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
return true;
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
return 0;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
return 1;
case PIPE_CAP_TGSI_TEXCOORD:
@ -486,6 +488,9 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:


@ -308,17 +308,4 @@ void
lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
#ifdef PIPE_ARCH_SSE
#include <emmintrin.h>
#include "util/u_sse.h"
static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
(int32_t)plane->dcdy, (int32_t)plane->eo);
}
#endif
#endif


@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
const struct lp_rast_plane *plane = GET_PLANES(tri);
@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
unsigned nr = 0;
__m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
__m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
__m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
/* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
__m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
__m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
__m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
__m128i zero = _mm_setzero_si128();
__m128i c;
__m128i dcdx;
__m128i dcdy;
__m128i rej4;
__m128i dcdx2;
__m128i dcdx3;
__m128i c, dcdx, dcdy, rej4;
__m128i dcdx_neg_mask, dcdy_neg_mask;
__m128i dcdx2, dcdx3;
__m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
__m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
__m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
__m128i unused;
transpose4_epi32(&p0, &p1, &p2, &zero,
&c, &dcdx, &dcdy, &rej4);
&c, &unused, &dcdx, &dcdy);
/* recalc eo - easier than trying to load as scalars / shuffle... */
dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
_mm_and_si128(dcdx_neg_mask, dcdx));
/* Adjust dcdx;
*/
@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
const struct lp_rast_plane *plane = GET_PLANES(tri);
unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
__m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
__m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
__m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
/* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
__m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
__m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
__m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
__m128i zero = _mm_setzero_si128();
__m128i c;
__m128i dcdx;
__m128i dcdy;
__m128i c, dcdx, dcdy;
__m128i dcdx2, dcdx3;
__m128i dcdx2;
__m128i dcdx3;
__m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
__m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
__m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
__m128i unused;
transpose4_epi32(&p0, &p1, &p2, &zero,
&c, &dcdx, &dcdy, &unused);
&c, &unused, &dcdx, &dcdy);
/* Adjust dcdx;
*/

View file

@ -311,6 +311,10 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
}
/* should only get here on unhandled cases */

View file

@ -168,6 +168,21 @@ struct lp_setup_context
const float (*v2)[4]);
};
static inline void
scissor_planes_needed(boolean scis_planes[4], struct u_rect *bbox,
struct u_rect *scissor)
{
/* left */
scis_planes[0] = (bbox->x0 < scissor->x0);
/* right */
scis_planes[1] = (bbox->x1 > scissor->x1);
/* top */
scis_planes[2] = (bbox->y0 < scissor->y0);
/* bottom */
scis_planes[3] = (bbox->y1 > scissor->y1);
}
void lp_setup_choose_triangle( struct lp_setup_context *setup );
void lp_setup_choose_line( struct lp_setup_context *setup );
void lp_setup_choose_point( struct lp_setup_context *setup );
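
A minimal standalone sketch of the idea behind scissor_planes_needed() above — a scissor edge only costs an extra plane when the primitive's bounding box actually crosses it (plain C with simplified types; not the driver code):

#include <stdio.h>

struct rect { int x0, y0, x1, y1; };

/* Count how many extra scissor planes a primitive needs: one per
 * scissor edge that its bounding box crosses.  A bbox fully inside
 * the scissor needs none. */
int planes_needed(const struct rect *bbox, const struct rect *scissor)
{
   int nr = 0;
   nr += bbox->x0 < scissor->x0;   /* left   */
   nr += bbox->x1 > scissor->x1;   /* right  */
   nr += bbox->y0 < scissor->y0;   /* top    */
   nr += bbox->y1 > scissor->y1;   /* bottom */
   return nr;
}

int main(void)
{
   struct rect scissor  = { 0, 0, 639, 479 };
   struct rect inside   = { 10, 10, 20, 20 };   /* 0 extra planes */
   struct rect crossing = { -5, 10, 20, 500 };  /* crosses left and bottom: 2 */
   printf("%d %d\n", planes_needed(&inside, &scissor),
          planes_needed(&crossing, &scissor));
   return 0;
}

The line and triangle setup paths below apply the same test twice: once to size the plane allocation and once to decide which scissor planes to actually emit.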

View file

@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup,
layer = MIN2(layer, scene->fb_max_layer);
}
if (setup->scissor_test) {
nr_planes = 8;
}
else {
nr_planes = 4;
}
dx = v1[0][0] - v2[0][0];
dy = v1[0][1] - v2[0][1];
area = (dx * dx + dy * dy);
@ -591,6 +584,18 @@ try_setup_line( struct lp_setup_context *setup,
bbox.x0 = MAX2(bbox.x0, 0);
bbox.y0 = MAX2(bbox.y0, 0);
nr_planes = 4;
/*
* Determine how many scissor planes we need, that is, drop a scissor
* edge when the bounding box of the tri is fully on its inner side.
*/
if (setup->scissor_test) {
/* why not just use draw_regions */
boolean s_planes[4];
scissor_planes_needed(s_planes, &bbox, &setup->scissors[viewport_index]);
nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
}
line = lp_setup_alloc_triangle(scene,
key->num_inputs,
nr_planes,
@ -708,30 +713,46 @@ try_setup_line( struct lp_setup_context *setup,
* Note that otherwise, the scissor planes only vary in 'C' value,
* and even then only on state-changes. Could alternatively store
* these planes elsewhere.
* (Or only store the c value together with a bit indicating which
* scissor edge this is, so rasterization would treat them differently
* (easier to evaluate) to ordinary planes.)
*/
if (nr_planes == 8) {
const struct u_rect *scissor =
&setup->scissors[viewport_index];
if (nr_planes > 4) {
/* why not just use draw_regions */
struct u_rect *scissor = &setup->scissors[viewport_index];
struct lp_rast_plane *plane_s = &plane[4];
boolean s_planes[4];
scissor_planes_needed(s_planes, &bbox, scissor);
plane[4].dcdx = -1 << 8;
plane[4].dcdy = 0;
plane[4].c = (1-scissor->x0) << 8;
plane[4].eo = 1 << 8;
plane[5].dcdx = 1 << 8;
plane[5].dcdy = 0;
plane[5].c = (scissor->x1+1) << 8;
plane[5].eo = 0;
plane[6].dcdx = 0;
plane[6].dcdy = 1 << 8;
plane[6].c = (1-scissor->y0) << 8;
plane[6].eo = 1 << 8;
plane[7].dcdx = 0;
plane[7].dcdy = -1 << 8;
plane[7].c = (scissor->y1+1) << 8;
plane[7].eo = 0;
if (s_planes[0]) {
plane_s->dcdx = -1 << 8;
plane_s->dcdy = 0;
plane_s->c = (1-scissor->x0) << 8;
plane_s->eo = 1 << 8;
plane_s++;
}
if (s_planes[1]) {
plane_s->dcdx = 1 << 8;
plane_s->dcdy = 0;
plane_s->c = (scissor->x1+1) << 8;
plane_s->eo = 0 << 8;
plane_s++;
}
if (s_planes[2]) {
plane_s->dcdx = 0;
plane_s->dcdy = 1 << 8;
plane_s->c = (1-scissor->y0) << 8;
plane_s->eo = 1 << 8;
plane_s++;
}
if (s_planes[3]) {
plane_s->dcdx = 0;
plane_s->dcdy = -1 << 8;
plane_s->c = (scissor->y1+1) << 8;
plane_s->eo = 0;
plane_s++;
}
assert(plane_s == &plane[nr_planes]);
}
return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index);

View file

@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
layer = MIN2(layer, scene->fb_max_layer);
}
if (setup->scissor_test) {
nr_planes = 7;
}
else {
nr_planes = 3;
}
/* Bounding rectangle (in pixels) */
{
/* Yes, this is necessary to accurately calculate bounding boxes
@ -347,6 +340,18 @@ do_triangle_ccw(struct lp_setup_context *setup,
bbox.x0 = MAX2(bbox.x0, 0);
bbox.y0 = MAX2(bbox.y0, 0);
nr_planes = 3;
/*
* Determine how many scissor planes we need, that is, drop a scissor
* edge when the bounding box of the tri is fully on its inner side.
*/
if (setup->scissor_test) {
/* why not just use draw_regions */
boolean s_planes[4];
scissor_planes_needed(s_planes, &bbox, &setup->scissors[viewport_index]);
nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
}
tri = lp_setup_alloc_triangle(scene,
key->num_inputs,
nr_planes,
@ -367,13 +372,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
/* Setup parameter interpolants:
*/
setup->setup.variant->jit_function( v0,
v1,
v2,
frontfacing,
GET_A0(&tri->inputs),
GET_DADX(&tri->inputs),
GET_DADY(&tri->inputs) );
setup->setup.variant->jit_function(v0, v1, v2,
frontfacing,
GET_A0(&tri->inputs),
GET_DADX(&tri->inputs),
GET_DADY(&tri->inputs));
tri->inputs.frontfacing = frontfacing;
tri->inputs.disable = FALSE;
@ -383,9 +386,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
if (0)
lp_dump_setup_coef(&setup->setup.variant->key,
(const float (*)[4])GET_A0(&tri->inputs),
(const float (*)[4])GET_DADX(&tri->inputs),
(const float (*)[4])GET_DADY(&tri->inputs));
(const float (*)[4])GET_A0(&tri->inputs),
(const float (*)[4])GET_DADX(&tri->inputs),
(const float (*)[4])GET_DADY(&tri->inputs));
plane = GET_PLANES(tri);
@ -672,29 +675,46 @@ do_triangle_ccw(struct lp_setup_context *setup,
* Note that otherwise, the scissor planes only vary in 'C' value,
* and even then only on state-changes. Could alternatively store
* these planes elsewhere.
* (Or only store the c value together with a bit indicating which
* scissor edge this is, so rasterization would treat them differently
* (easier to evaluate) to ordinary planes.)
*/
if (nr_planes == 7) {
const struct u_rect *scissor = &setup->scissors[viewport_index];
if (nr_planes > 3) {
/* why not just use draw_regions */
struct u_rect *scissor = &setup->scissors[viewport_index];
struct lp_rast_plane *plane_s = &plane[3];
boolean s_planes[4];
scissor_planes_needed(s_planes, &bbox, scissor);
plane[3].dcdx = -1 << 8;
plane[3].dcdy = 0;
plane[3].c = (1-scissor->x0) << 8;
plane[3].eo = 1 << 8;
plane[4].dcdx = 1 << 8;
plane[4].dcdy = 0;
plane[4].c = (scissor->x1+1) << 8;
plane[4].eo = 0;
plane[5].dcdx = 0;
plane[5].dcdy = 1 << 8;
plane[5].c = (1-scissor->y0) << 8;
plane[5].eo = 1 << 8;
plane[6].dcdx = 0;
plane[6].dcdy = -1 << 8;
plane[6].c = (scissor->y1+1) << 8;
plane[6].eo = 0;
if (s_planes[0]) {
plane_s->dcdx = -1 << 8;
plane_s->dcdy = 0;
plane_s->c = (1-scissor->x0) << 8;
plane_s->eo = 1 << 8;
plane_s++;
}
if (s_planes[1]) {
plane_s->dcdx = 1 << 8;
plane_s->dcdy = 0;
plane_s->c = (scissor->x1+1) << 8;
plane_s->eo = 0 << 8;
plane_s++;
}
if (s_planes[2]) {
plane_s->dcdx = 0;
plane_s->dcdy = 1 << 8;
plane_s->c = (1-scissor->y0) << 8;
plane_s->eo = 1 << 8;
plane_s++;
}
if (s_planes[3]) {
plane_s->dcdx = 0;
plane_s->dcdy = -1 << 8;
plane_s->c = (scissor->y1+1) << 8;
plane_s->eo = 0;
plane_s++;
}
assert(plane_s == &plane[nr_planes]);
}
return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
@ -984,17 +1004,16 @@ calc_fixed_position(struct lp_setup_context *setup,
* Both should be acceptable, I think.
*/
#if defined(PIPE_ARCH_SSE)
__m128d v0r, v1r, v2r;
__m128 v0r, v1r;
__m128 vxy0xy2, vxy1xy0;
__m128i vxy0xy2i, vxy1xy0i;
__m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
__m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
__m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
v0r = _mm_load_sd((const double *)v0[0]);
v1r = _mm_load_sd((const double *)v1[0]);
v2r = _mm_load_sd((const double *)v2[0]);
vxy0xy2 = _mm_castpd_ps(_mm_unpacklo_pd(v0r, v2r));
vxy1xy0 = _mm_castpd_ps(_mm_unpacklo_pd(v1r, v0r));
v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0]));
vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]);
v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0]));
vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2);
vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);

View file

@ -393,6 +393,9 @@ ImmediateValue::isInteger(const int i) const
case TYPE_S32:
case TYPE_U32:
return reg.data.s32 == i; // as if ...
case TYPE_S64:
case TYPE_U64:
return reg.data.s64 == i; // as if ...
case TYPE_F32:
return reg.data.f32 == static_cast<float>(i);
case TYPE_F64:

View file

@ -132,6 +132,7 @@ enum operation
OP_SUBFM, // surface bitfield manipulation
OP_SUCLAMP, // clamp surface coordinates
OP_SUEAU, // surface effective address
OP_SUQ, // surface query
OP_MADSP, // special integer multiply-add
OP_TEXBAR, // texture dependency barrier
OP_DFDX,

View file

@ -1947,10 +1947,16 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
case OP_CEIL:
case OP_FLOOR:
case OP_TRUNC:
case OP_CVT:
case OP_SAT:
emitCVT(insn);
break;
case OP_CVT:
if (insn->def(0).getFile() == FILE_PREDICATE ||
insn->src(0).getFile() == FILE_PREDICATE)
emitMOV(insn);
else
emitCVT(insn);
break;
case OP_RSQ:
emitSFnOp(insn, 5 + 2 * insn->subOp);
break;

View file

@ -673,7 +673,12 @@ CodeEmitterGM107::emitMOV()
(insn->sType != TYPE_F32 && !longIMMD(insn->src(0)))) {
switch (insn->src(0).getFile()) {
case FILE_GPR:
emitInsn(0x5c980000);
if (insn->def(0).getFile() == FILE_PREDICATE) {
emitInsn(0x5b6a0000);
emitGPR (0x08);
} else {
emitInsn(0x5c980000);
}
emitGPR (0x14, insn->src(0));
break;
case FILE_MEMORY_CONST:
@ -684,18 +689,32 @@ CodeEmitterGM107::emitMOV()
emitInsn(0x38980000);
emitIMMD(0x14, 19, insn->src(0));
break;
case FILE_PREDICATE:
emitInsn(0x50880000);
emitPRED(0x0c, insn->src(0));
emitPRED(0x1d);
emitPRED(0x27);
break;
default:
assert(!"bad src file");
break;
}
emitField(0x27, 4, insn->lanes);
if (insn->def(0).getFile() != FILE_PREDICATE &&
insn->src(0).getFile() != FILE_PREDICATE)
emitField(0x27, 4, insn->lanes);
} else {
emitInsn (0x01000000);
emitIMMD (0x14, 32, insn->src(0));
emitField(0x0c, 4, insn->lanes);
}
emitGPR(0x00, insn->def(0));
if (insn->def(0).getFile() == FILE_PREDICATE) {
emitPRED(0x27);
emitPRED(0x03, insn->def(0));
emitPRED(0x00);
} else {
emitGPR(0x00, insn->def(0));
}
}
void
@ -2684,11 +2703,7 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
emitRAM();
break;
case OP_MOV:
if (insn->def(0).getFile() == FILE_GPR &&
insn->src(0).getFile() != FILE_PREDICATE)
emitMOV();
else
assert(!"R2P/P2R");
emitMOV();
break;
case OP_RDSV:
emitS2R();
@ -2700,7 +2715,10 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
case OP_CEIL:
case OP_TRUNC:
case OP_CVT:
if (isFloatType(insn->dType)) {
if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE ||
insn->src(0).getFile() == FILE_PREDICATE)) {
emitMOV();
} else if (isFloatType(insn->dType)) {
if (isFloatType(insn->sType))
emitF2F();
else

View file

@ -2021,8 +2021,10 @@ CodeEmitterNVC0::emitATOM(const Instruction *i)
code[0] |= 63 << 20;
}
if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
srcId(i->src(2), 32 + 17);
if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) {
assert(i->src(1).getSize() == 2 * typeSizeof(i->sType));
code[1] |= (SDATA(i->src(1)).id + 1) << 17;
}
}
void
@ -2433,10 +2435,16 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
case OP_CEIL:
case OP_FLOOR:
case OP_TRUNC:
case OP_CVT:
case OP_SAT:
emitCVT(insn);
break;
case OP_CVT:
if (insn->def(0).getFile() == FILE_PREDICATE ||
insn->src(0).getFile() == FILE_PREDICATE)
emitMOV(insn);
else
emitCVT(insn);
break;
case OP_RSQ:
emitSFnOp(insn, 5 + 2 * insn->subOp);
break;

View file

@ -38,6 +38,7 @@ static nv50_ir::operation translateOpcode(uint opcode);
static nv50_ir::DataFile translateFile(uint file);
static nv50_ir::TexTarget translateTexture(uint texTarg);
static nv50_ir::SVSemantic translateSysVal(uint sysval);
static nv50_ir::CacheMode translateCacheMode(uint qualifier);
class Instruction
{
@ -213,6 +214,12 @@ public:
nv50_ir::TexInstruction::Target getTexture(const Source *, int s) const;
nv50_ir::CacheMode getCacheMode() const {
if (!insn->Instruction.Memory)
return nv50_ir::CACHE_CA;
return translateCacheMode(insn->Memory.Qualifier);
}
inline uint getLabel() { return insn->Label.Label; }
unsigned getSaturate() const { return insn->Instruction.Saturate; }
@ -366,7 +373,7 @@ static nv50_ir::DataFile translateFile(uint file)
case TGSI_FILE_PREDICATE: return nv50_ir::FILE_PREDICATE;
case TGSI_FILE_IMMEDIATE: return nv50_ir::FILE_IMMEDIATE;
case TGSI_FILE_SYSTEM_VALUE: return nv50_ir::FILE_SYSTEM_VALUE;
//case TGSI_FILE_RESOURCE: return nv50_ir::FILE_MEMORY_GLOBAL;
case TGSI_FILE_BUFFER: return nv50_ir::FILE_MEMORY_GLOBAL;
case TGSI_FILE_SAMPLER:
case TGSI_FILE_NULL:
default:
@ -436,6 +443,15 @@ static nv50_ir::TexTarget translateTexture(uint tex)
}
}
static nv50_ir::CacheMode translateCacheMode(uint qualifier)
{
if (qualifier & TGSI_MEMORY_VOLATILE)
return nv50_ir::CACHE_CV;
if (qualifier & TGSI_MEMORY_COHERENT)
return nv50_ir::CACHE_CG;
return nv50_ir::CACHE_CA;
}
nv50_ir::DataType Instruction::inferSrcType() const
{
switch (getOpcode()) {
@ -1210,6 +1226,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
case TGSI_FILE_IMMEDIATE:
case TGSI_FILE_PREDICATE:
case TGSI_FILE_SAMPLER:
case TGSI_FILE_BUFFER:
break;
default:
ERROR("unhandled TGSI_FILE %d\n", decl->Declaration.File);
@ -1255,6 +1272,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
if (insn.getDst(0).isIndirect(0))
indirectTempArrays.insert(insn.getDst(0).getArrayId());
} else
if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) {
info->io.globalAccess |= 0x2;
}
}
@ -1264,13 +1284,10 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
if (src.isIndirect(0))
indirectTempArrays.insert(src.getArrayId());
} else
/*
if (src.getFile() == TGSI_FILE_RESOURCE) {
if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
if (src.getFile() == TGSI_FILE_BUFFER) {
info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
0x1 : 0x2;
} else
*/
if (src.getFile() == TGSI_FILE_OUTPUT) {
if (src.isIndirect(0)) {
// We don't know which one is accessed, just mark everything for
@ -1752,7 +1769,7 @@ Converter::acquireDst(int d, int c)
int idx = dst.getIndex(0);
int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
if (dst.isMasked(c) || f == TGSI_FILE_BUFFER)
return NULL;
if (dst.isIndirect(0) ||
@ -2222,6 +2239,28 @@ Converter::handleLOAD(Value *dst0[4])
int c;
std::vector<Value *> off, src, ldv, def;
if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER) {
for (c = 0; c < 4; ++c) {
if (!dst0[c])
continue;
Value *off = fetchSrc(1, c);
Symbol *sym;
if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE) {
off = NULL;
sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, tgsi.getSrc(1).getValueU32(0, info) + 4 * c);
} else {
sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, 4 * c);
}
Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off);
ld->cache = tgsi.getCacheMode();
if (tgsi.getSrc(0).isIndirect(0))
ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
}
return;
}
getResourceCoords(off, r, 1);
if (isResourceRaw(code, r)) {
@ -2298,6 +2337,30 @@ Converter::handleSTORE()
int c;
std::vector<Value *> off, src, dummy;
if (tgsi.getDst(0).getFile() == TGSI_FILE_BUFFER) {
for (c = 0; c < 4; ++c) {
if (!(tgsi.getDst(0).getMask() & (1 << c)))
continue;
Symbol *sym;
Value *off;
if (tgsi.getSrc(0).getFile() == TGSI_FILE_IMMEDIATE) {
off = NULL;
sym = makeSym(TGSI_FILE_BUFFER, r, -1, c,
tgsi.getSrc(0).getValueU32(0, info) + 4 * c);
} else {
off = fetchSrc(0, 0);
sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, 4 * c);
}
Instruction *st = mkStore(OP_STORE, TYPE_U32, sym, off, fetchSrc(1, c));
st->cache = tgsi.getCacheMode();
if (tgsi.getDst(0).isIndirect(0))
st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0));
}
return;
}
getResourceCoords(off, r, 0);
src = off;
const int s = src.size();
@ -2359,6 +2422,37 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
std::vector<Value *> defv;
LValue *dst = getScratch();
if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER) {
for (int c = 0; c < 4; ++c) {
if (!dst0[c])
continue;
Instruction *insn;
Value *off = fetchSrc(1, c), *off2 = NULL;
Value *sym;
if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE)
sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, tgsi.getSrc(1).getValueU32(c, info));
else
sym = makeSym(TGSI_FILE_BUFFER, r, -1, c, 0);
if (tgsi.getSrc(0).isIndirect(0))
off2 = fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0);
if (subOp == NV50_IR_SUBOP_ATOM_CAS)
insn = mkOp3(OP_ATOM, ty, dst, sym, fetchSrc(2, c), fetchSrc(3, c));
else
insn = mkOp2(OP_ATOM, ty, dst, sym, fetchSrc(2, c));
if (tgsi.getSrc(1).getFile() != TGSI_FILE_IMMEDIATE)
insn->setIndirect(0, 0, off);
if (off2)
insn->setIndirect(0, 1, off2);
insn->subOp = subOp;
}
for (int c = 0; c < 4; ++c)
if (dst0[c])
dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
return;
}
getResourceCoords(srcv, r, 1);
if (isResourceSpecial(r)) {
@ -3103,6 +3197,14 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
geni->fixed = 1;
geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
break;
case TGSI_OPCODE_MEMBAR:
geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL);
geni->fixed = 1;
if (tgsi.getSrc(0).getValueU32(0, info) & TGSI_MEMBAR_THREAD_GROUP)
geni->subOp = NV50_IR_SUBOP_MEMBAR(M, CTA);
else
geni->subOp = NV50_IR_SUBOP_MEMBAR(M, GL);
break;
case TGSI_OPCODE_ATOMUADD:
case TGSI_OPCODE_ATOMXCHG:
case TGSI_OPCODE_ATOMCAS:
@ -3115,6 +3217,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
case TGSI_OPCODE_ATOMIMAX:
handleATOM(dst0, dstTy, tgsi::opcodeToSubOp(tgsi.getOpcode()));
break;
case TGSI_OPCODE_RESQ:
geni = mkOp1(OP_SUQ, TYPE_U32, dst0[0],
makeSym(TGSI_FILE_BUFFER, tgsi.getSrc(0).getIndex(0), -1, 0, 0));
if (tgsi.getSrc(0).isIndirect(0))
geni->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
break;
case TGSI_OPCODE_IBFE:
case TGSI_OPCODE_UBFE:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {

View file

@ -1022,11 +1022,22 @@ NVC0LoweringPass::handleTXLQ(TexInstruction *i)
return true;
}
bool
NVC0LoweringPass::handleSUQ(Instruction *suq)
{
suq->op = OP_MOV;
suq->setSrc(0, loadResLength32(suq->getIndirect(0, 1),
suq->getSrc(0)->reg.fileIndex * 16));
suq->setIndirect(0, 0, NULL);
suq->setIndirect(0, 1, NULL);
return true;
}
bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
SVSemantic sv;
Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;
switch (atom->src(0).getFile()) {
case FILE_MEMORY_LOCAL:
@ -1037,16 +1048,22 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
break;
default:
assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
assert(base->reg.size == 8);
if (ptr)
base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
assert(base->reg.size == 8);
atom->setIndirect(0, 0, base);
return true;
}
Value *base =
base =
bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
Value *ptr = atom->getIndirect(0, 0);
atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
if (ptr)
base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
atom->setIndirect(0, 1, NULL);
atom->setIndirect(0, 0, base);
return true;
@ -1069,7 +1086,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
cctl->setPredicate(cas->cc, cas->getPredicate());
}
if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
// CAS is crazy. Its 2nd source is a double reg, and the 3rd source
// should be set to the high part of the double reg or bad things will
// happen elsewhere in the universe.
@ -1079,6 +1096,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
bld.setPosition(cas, false);
bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
cas->setSrc(1, dreg);
cas->setSrc(2, dreg);
}
return true;
@ -1093,6 +1111,32 @@ NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}
inline Value *
NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
{
uint8_t b = prog->driver->io.resInfoCBSlot;
off += prog->driver->io.suInfoBase;
if (ptr)
ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
return bld.
mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
}
inline Value *
NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
{
uint8_t b = prog->driver->io.resInfoCBSlot;
off += prog->driver->io.suInfoBase;
if (ptr)
ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
return bld.
mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
}
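
These two helpers read from a 16-byte per-buffer record in the driver constant buffer (the indirect index is shifted left by 4, i.e. scaled by the record size). The layout implied by the offsets — a 64-bit address at +0 and a 32-bit size at +8 — sketches out as follows; the field names are invented for illustration:

#include <stdint.h>

/* Hypothetical view of one slot in the suInfo area: loadResInfo64
 * fetches 'address' (offset + 0), loadResLength32 fetches 'size'
 * (offset + 8); the last word is padding. */
struct buf_info {
   uint64_t address;  /* GPU virtual address of the buffer         */
   uint32_t size;     /* buffer length in bytes, for bounds checks */
   uint32_t pad;      /* written as 0 by the state validation code */
};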
inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
@ -1786,6 +1830,7 @@ NVC0LoweringPass::visit(Instruction *i)
return handleRDSV(i);
case OP_WRSV:
return handleWRSV(i);
case OP_STORE:
case OP_LOAD:
if (i->src(0).getFile() == FILE_SHADER_INPUT) {
if (prog->getType() == Program::TYPE_COMPUTE) {
@ -1820,6 +1865,26 @@ NVC0LoweringPass::visit(Instruction *i)
} else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
i->op = OP_VFETCH;
} else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
Value *ind = i->getIndirect(0, 1);
Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
// XXX come up with a way not to do this for EVERY little access but
// rather to batch these up somehow. Unfortunately we've lost the
// information about the field width by the time we get here.
Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
Value *length = loadResLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
Value *pred = new_LValue(func, FILE_PREDICATE);
if (i->src(0).isIndirect(0)) {
bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
}
i->setIndirect(0, 1, NULL);
i->setIndirect(0, 0, ptr);
bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
i->setPredicate(CC_NOT_P, pred);
if (i->defExists(0)) {
bld.mkMov(i->getDef(0), bld.mkImm(0));
}
}
break;
case OP_ATOM:
@ -1838,6 +1903,9 @@ NVC0LoweringPass::visit(Instruction *i)
if (targ->getChipset() >= NVISA_GK104_CHIPSET)
handleSurfaceOpNVE4(i->asTex());
break;
case OP_SUQ:
handleSUQ(i);
break;
default:
break;
}
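
The FILE_MEMORY_GLOBAL case above makes every buffer access bounds-checked: the buffer's address and length come from the 16-byte records sketched earlier, the operation is predicated off when the end of the access lies past the length, and a skipped load yields 0. A simplified plain-C model of that behavior (hypothetical helper, not the IR builder API; the real pass also folds in the indirect offset):

#include <stdint.h>
#include <string.h>

struct buffer { uint8_t *base; uint32_t length; };

/* Model of the lowered load: proceed only when the whole access fits
 * (the OP_SET + CC_NOT_P predicate pair above), otherwise return 0
 * (the mkMov(def, 0) fallback). */
uint32_t checked_load32(const struct buffer *buf, uint32_t offset)
{
   uint32_t v = 0;
   if (offset + sizeof(v) <= buf->length)
      memcpy(&v, buf->base + offset, sizeof(v));
   return v;
}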

View file

@ -101,6 +101,7 @@ protected:
bool handleTXQ(TexInstruction *);
virtual bool handleManualTXD(TexInstruction *);
bool handleTXLQ(TexInstruction *);
bool handleSUQ(Instruction *);
bool handleATOM(Instruction *);
bool handleCasExch(Instruction *, bool needCctl);
void handleSurfaceOpNVE4(TexInstruction *);
@ -116,6 +117,8 @@ private:
void readTessCoord(LValue *dst, int c);
Value *loadResInfo32(Value *ptr, uint32_t off);
Value *loadResInfo64(Value *ptr, uint32_t off);
Value *loadResLength32(Value *ptr, uint32_t off);
Value *loadMsInfo32(Value *ptr, uint32_t off);
Value *loadTexHandle(Value *ptr, unsigned int slot);

View file

@ -336,6 +336,7 @@ private:
void expr(Instruction *, ImmediateValue&, ImmediateValue&);
void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
void opnd(Instruction *, ImmediateValue&, int s);
void opnd3(Instruction *, ImmediateValue&);
void unary(Instruction *, const ImmediateValue&);
@ -388,6 +389,8 @@ ConstantFolding::visit(BasicBlock *bb)
else
if (i->srcExists(1) && i->src(1).getImmediate(src1))
opnd(i, src1, 1);
if (i->srcExists(2) && i->src(2).getImmediate(src2))
opnd3(i, src2);
}
return true;
}
@ -872,6 +875,24 @@ ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
}
}
void
ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
{
switch (i->op) {
case OP_MAD:
case OP_FMA:
if (imm2.isInteger(0)) {
i->op = OP_MUL;
i->setSrc(2, NULL);
foldCount++;
return;
}
break;
default:
return;
}
}
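
opnd3 is a new hook for folding on an instruction's third source; its only case so far rewrites a MAD/FMA whose addend is the immediate 0 into a plain MUL (a*b + 0 == a*b, at least for the integer case). A sketch of the transform with stand-in types, not the nv50 IR classes:

enum op { OP_MAD, OP_FMA, OP_MUL };

struct insn { enum op op; int nsrc; };

/* Drop a known-zero addend: MAD/FMA (a, b, 0) -> MUL (a, b). */
void fold_opnd3(struct insn *i, int imm2)
{
   if ((i->op == OP_MAD || i->op == OP_FMA) && imm2 == 0) {
      i->op = OP_MUL;   /* the real pass also does setSrc(2, NULL) */
      i->nsrc = 2;
   }
}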
void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
@ -1202,6 +1223,14 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
}
break;
case OP_SHR:
if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
bld.setPosition(i, false);
i->op = OP_AND;
i->setSrc(0, si->getSrc(0));
i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
}
break;
case OP_MUL:
int muls;
if (isFloatType(si->dType))
@ -2504,6 +2533,12 @@ MemoryOpt::runOpt(BasicBlock *bb)
}
} else
if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
if (typeSizeof(ldst->dType) == 4 &&
ldst->src(1).getFile() == FILE_GPR &&
ldst->getSrc(1)->getInsn()->op == OP_NOP) {
delete_Instruction(prog, ldst);
continue;
}
isLoad = false;
} else {
// TODO: maybe have all fixed ops act as barrier ?
@ -3015,7 +3050,7 @@ Instruction::isResultEqual(const Instruction *that) const
if (that->srcExists(s))
return false;
if (op == OP_LOAD || op == OP_VFETCH) {
if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
switch (src(0).getFile()) {
case FILE_MEMORY_CONST:
case FILE_SHADER_INPUT:
@ -3046,6 +3081,8 @@ GlobalCSE::visit(BasicBlock *bb)
ik = phi->getSrc(0)->getInsn();
if (!ik)
continue; // probably a function input
if (ik->defCount(0xff) > 1)
continue; // too painful to check if we can really push this forward
for (s = 1; phi->srcExists(s); ++s) {
if (phi->getSrc(s)->refCount() > 1)
break;
@ -3179,10 +3216,10 @@ DeadCodeElim::buryAll(Program *prog)
bool
DeadCodeElim::visit(BasicBlock *bb)
{
Instruction *next;
Instruction *prev;
for (Instruction *i = bb->getFirst(); i; i = next) {
next = i->next;
for (Instruction *i = bb->getExit(); i; i = prev) {
prev = i->prev;
if (i->isDead()) {
++deadCount;
delete_Instruction(prog, i);
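
One of the folds above rests on a small bit identity: the new OP_SHR sub-case of OP_SHL turns (x >> c) << c into a single AND, since the shift pair just clears the low c bits. A standalone check of that identity (plain C):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   uint32_t x = 0xdeadbeefu;
   for (uint32_t c = 0; c < 32; ++c)
      /* shifting right then left by c clears the low c bits, which
       * is exactly an AND with ~((1 << c) - 1) */
      assert(((x >> c) << c) == (x & ~((1u << c) - 1u)));
   return 0;
}

Separately, the DCE walk now iterates from the block exit backwards: a dead user is buried before its producer, so by the time the producer is visited it has lost its last use and dies too, letting whole dead chains disappear in a single visit.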

View file

@ -161,6 +161,7 @@ const char *operationStr[OP_LAST + 1] =
"subfm",
"suclamp",
"sueau",
"suq",
"madsp",
"texbar",
"dfdx",

View file

@ -1544,6 +1544,9 @@ GCRA::cleanup(const bool success)
delete[] nodes;
nodes = NULL;
hi.next = hi.prev = &hi;
lo[0].next = lo[0].prev = &lo[0];
lo[1].next = lo[1].prev = &lo[1];
}
Symbol *

View file

@ -46,7 +46,7 @@ const uint8_t Target::operationSrcNr[] =
1, 1, 1, // TEX, TXB, TXL,
1, 1, 1, 1, 1, 1, 2, // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP
1, 1, 2, 2, 2, 2, 2, // SULDB, SULDP, SUSTB, SUSTP, SUREDB, SUREDP, SULEA
3, 3, 3, 3, // SUBFM, SUCLAMP, SUEAU, MADSP
3, 3, 3, 1, 3, // SUBFM, SUCLAMP, SUEAU, SUQ, MADSP
0, // TEXBAR
1, 1, // DFDX, DFDY
1, 2, 1, 2, 0, 0, // RDSV, WRSV, PIXLD, QUADOP, QUADON, QUADPOP
@ -109,8 +109,8 @@ const OpClass Target::operationClass[] =
// SULDB, SULDP, SUSTB, SUSTP; SUREDB, SUREDP, SULEA
OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_ATOMIC, OPCLASS_SURFACE,
OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_SURFACE,
// SUBFM, SUCLAMP, SUEAU, MADSP
OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_ARITH,
// SUBFM, SUCLAMP, SUEAU, SUQ, MADSP
OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_ARITH,
// TEXBAR
OPCLASS_OTHER,
// DFDX, DFDY, RDSV, WRSV; PIXLD, QUADOP, QUADON, QUADPOP

View file

@ -266,7 +266,9 @@ nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
int i;
for (i = 0; i < num_buffers; ++i) {
#ifndef NDEBUG
assert(bsp_bo->size >= str_bsp->w0[0] + num_bytes[i]);
#endif
memcpy(dec->bsp_ptr, data[i], num_bytes[i]);
dec->bsp_ptr += num_bytes[i];
str_bsp->w0[0] += num_bytes[i];

View file

@ -184,6 +184,10 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:

View file

@ -369,7 +369,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
NOUVEAU_ERR("shader translation failed: %i\n", ret);
goto out;
}
FREE(info->bin.syms);
prog->code = info->bin.code;
prog->code_size = info->bin.codeSize;
@ -403,10 +402,13 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
break;
}
prog->gp.vert_count = info->prop.gp.maxVertices;
} else
}
if (prog->type == PIPE_SHADER_COMPUTE) {
prog->cp.syms = info->bin.syms;
prog->cp.num_syms = info->bin.numSyms;
} else {
FREE(info->bin.syms);
}
if (prog->pipe.stream_output.num_outputs)
@ -507,6 +509,9 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
FREE(p->interps);
FREE(p->so);
if (type == PIPE_SHADER_COMPUTE)
FREE(p->cp.syms);
memset(p, 0, sizeof(*p));
p->pipe = pipe;

View file

@ -227,6 +227,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:

View file

@ -594,6 +594,82 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
PUSH_DATA (push, nv50->rt_array_mode);
}
static void
nv50_clear_buffer_push(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
{
struct nv50_context *nv50 = nv50_context(pipe);
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv04_resource *buf = nv04_resource(res);
unsigned count = (size + 3) / 4;
unsigned xcoord = offset & 0xff;
unsigned tmp, i;
if (data_size == 1) {
tmp = *(unsigned char *)data;
tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
data = &tmp;
data_size = 4;
} else if (data_size == 2) {
tmp = *(unsigned short *)data;
tmp = (tmp << 16) | tmp;
data = &tmp;
data_size = 4;
}
unsigned data_words = data_size / 4;
nouveau_bufctx_refn(nv50->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
nouveau_pushbuf_bufctx(push, nv50->bufctx);
nouveau_pushbuf_validate(push);
offset &= ~0xff;
BEGIN_NV04(push, NV50_2D(DST_FORMAT), 2);
PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_2D(DST_PITCH), 5);
PUSH_DATA (push, 262144);
PUSH_DATA (push, 65536);
PUSH_DATA (push, 1);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
BEGIN_NV04(push, NV50_2D(SIFC_BITMAP_ENABLE), 2);
PUSH_DATA (push, 0);
PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
BEGIN_NV04(push, NV50_2D(SIFC_WIDTH), 10);
PUSH_DATA (push, size);
PUSH_DATA (push, 1);
PUSH_DATA (push, 0);
PUSH_DATA (push, 1);
PUSH_DATA (push, 0);
PUSH_DATA (push, 1);
PUSH_DATA (push, 0);
PUSH_DATA (push, xcoord);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
while (count) {
unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
unsigned nr = nr_data * data_words;
BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr);
for (i = 0; i < nr_data; i++)
PUSH_DATAp(push, data, data_words);
count -= nr;
}
if (buf->mm) {
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
}
nouveau_bufctx_reset(nv50->bufctx, 0);
}
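
The 8- and 16-bit cases at the top widen the clear pattern by replication so the SIFC loop below always emits whole 32-bit words; a standalone check of the replication (plain C):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   uint32_t b = 0xab;                       /* 1-byte pattern */
   b = (b << 24) | (b << 16) | (b << 8) | b;
   assert(b == 0xababababu);

   uint32_t h = 0x1234;                     /* 2-byte pattern */
   h = (h << 16) | h;
   assert(h == 0x12341234u);
   return 0;
}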
static void
nv50_clear_buffer(struct pipe_context *pipe,
struct pipe_resource *res,
@ -643,9 +719,22 @@ nv50_clear_buffer(struct pipe_context *pipe,
assert(size % data_size == 0);
if (offset & 0xff) {
unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
assert(fixup_size % data_size == 0);
nv50_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
offset += fixup_size;
size -= fixup_size;
if (!size)
return;
}
elements = size / data_size;
height = (elements + 8191) / 8192;
width = elements / height;
if (height > 1)
width &= ~0xff;
assert(width > 0);
BEGIN_NV04(push, NV50_3D(CLEAR_COLOR(0)), 4);
PUSH_DATAf(push, color.f[0]);
@ -669,13 +758,13 @@ nv50_clear_buffer(struct pipe_context *pipe,
BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 5);
PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
PUSH_DATA (push, nv50_format_table[dst_fmt].rt);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | align(width * data_size, 0x100));
PUSH_DATA (push, height);
BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
PUSH_DATA (push, 0);
@ -694,25 +783,20 @@ nv50_clear_buffer(struct pipe_context *pipe,
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
PUSH_DATA (push, 0x3c);
if (width * height != elements) {
offset += width * height * data_size;
width = elements - width * height;
height = 1;
BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 2);
PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
PUSH_DATA (push, height);
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
PUSH_DATA (push, 0x3c);
}
BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
PUSH_DATA (push, nv50->cond_condmode);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
if (buf->mm) {
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
}
if (width * height != elements) {
offset += width * height * data_size;
width = elements - width * height;
nv50_clear_buffer_push(pipe, res, offset, width * data_size,
data, data_size);
}
nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
}

View file

@ -491,3 +491,52 @@ daic_runout:
daic_runout_check:
branz annul $r7 #daic_runout
bra annul #daic_restore
/* NVC0_3D_MACRO_QUERY_BUFFER_WRITE:
*
* This is a combination macro for all of our query buffer object needs.
* It has the option to clamp results to a configurable amount, as well as
* to write out one or two words.
*
* We use the query engine to write out the values, and expect the query
* address to point to the right place.
*
* arg = clamp value (0 means unclamped); a clamped query writes just one value.
* parm[0] = LSB of end value
* parm[1] = MSB of end value
* parm[2] = LSB of start value
* parm[3] = MSB of start value
* parm[4] = desired sequence
* parm[5] = actual sequence
*/
.section #mme9097_query_buffer_write
parm $r2
parm $r3
parm $r4
parm $r5 maddr 0x16c2 /* QUERY_SEQUENCE */
parm $r6
parm $r7
mov $r6 (sub $r7 $r6) /* actual - desired */
mov $r6 (sbb 0x0 0x0) /* if there was underflow, not reached yet */
braz annul $r6 #qbw_ready
exit
qbw_ready:
mov $r2 (sub $r2 $r4)
braz $r1 #qbw_postclamp
mov $r3 (sbb $r3 $r5)
branz annul $r3 #qbw_clamp
mov $r4 (sub $r1 $r2)
mov $r4 (sbb 0x0 0x0)
braz annul $r4 #qbw_postclamp
qbw_clamp:
mov $r2 $r1
qbw_postclamp:
send $r2
mov $r4 0x1000
branz annul $r1 #qbw_done
send (extrinsrt 0x0 $r4 0x0 0x10 0x10)
maddr 0x16c2 /* QUERY_SEQUENCE */
send $r3
qbw_done:
exit send (extrinsrt 0x0 $r4 0x0 0x10 0x10)
nop
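
MME listings are terse, so here is a rough C model of what the macro computes — control flow only, not the method encoding, and the function name is invented:

#include <stdint.h>

/* Returns the number of 32-bit QUERY_SEQUENCE writes the macro would
 * trigger.  clamp == 0 means unclamped: write low and high words;
 * nonzero: write a single word, capped at 'clamp'. */
static int query_buffer_write(uint32_t clamp,
                              uint64_t end, uint64_t start,
                              uint32_t desired_seq, uint32_t actual_seq,
                              uint32_t out[2])
{
   if (actual_seq < desired_seq)   /* borrow above: result not landed yet */
      return 0;

   uint64_t diff = end - start;
   if (clamp) {
      out[0] = diff > clamp ? clamp : (uint32_t)diff;
      return 1;
   }
   out[0] = (uint32_t)diff;
   out[1] = (uint32_t)(diff >> 32);
   return 2;
}

int main(void)
{
   uint32_t out[2] = { 0, 0 };
   /* unclamped 64-bit result, sequence already reached: two words */
   int n = query_buffer_write(0, 100000, 64, 7, 9, out);
   return (n == 2 && out[0] == 99936) ? 0 : 1;
}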

View file

@ -332,3 +332,36 @@ uint32_t mme9097_draw_arrays_indirect_count[] = {
0xfffef837,
0xfffdc027,
};
uint32_t mme9097_query_buffer_write[] = {
0x00000201,
0x00000301,
/* 0x000a: qbw_ready */
0x00000401,
0x05b08551,
/* 0x0011: qbw_clamp */
/* 0x0012: qbw_postclamp */
0x00000601,
0x00000701,
/* 0x0018: qbw_done */
0x0005be10,
0x00060610,
0x0000b027,
0x00000091,
0x00051210,
0x0001c807,
0x00075b10,
0x00011837,
0x00048c10,
0x00060410,
0x0000a027,
0x00000a11,
0x00001041,
0x04000411,
0x00010837,
0x84010042,
0x05b08021,
0x00001841,
0x840100c2,
0x00000011,
};

View file

@ -56,6 +56,7 @@ static void
nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
int i, s;
if (flags & PIPE_BARRIER_MAPPED_BUFFER) {
@ -90,6 +91,9 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
}
}
}
if (flags & PIPE_BARRIER_SHADER_BUFFER) {
IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
}
}
static void
@ -122,6 +126,10 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
pipe_surface_reference(&nvc0->surfaces[s][i], NULL);
}
for (s = 0; s < 6; ++s)
for (i = 0; i < NVC0_MAX_BUFFERS; ++i)
pipe_resource_reference(&nvc0->buffers[s][i].buffer, NULL);
for (i = 0; i < nvc0->num_tfbbufs; ++i)
pipe_so_target_reference(&nvc0->tfbbuf[i], NULL);
@ -180,10 +188,9 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
int ref)
{
struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
unsigned s, i;
if (bind & PIPE_BIND_RENDER_TARGET) {
if (res->bind & PIPE_BIND_RENDER_TARGET) {
for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
if (nvc0->framebuffer.cbufs[i] &&
nvc0->framebuffer.cbufs[i]->texture == res) {
@ -194,7 +201,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
}
if (bind & PIPE_BIND_DEPTH_STENCIL) {
if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
if (nvc0->framebuffer.zsbuf &&
nvc0->framebuffer.zsbuf->texture == res) {
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@ -204,12 +211,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
if (bind & (PIPE_BIND_VERTEX_BUFFER |
PIPE_BIND_INDEX_BUFFER |
PIPE_BIND_CONSTANT_BUFFER |
PIPE_BIND_STREAM_OUTPUT |
PIPE_BIND_COMMAND_ARGS_BUFFER |
PIPE_BIND_SAMPLER_VIEW)) {
if (res->target == PIPE_BUFFER) {
for (i = 0; i < nvc0->num_vtxbufs; ++i) {
if (nvc0->vtxbuf[i].buffer == res) {
nvc0->dirty |= NVC0_NEW_ARRAYS;
@ -253,6 +255,18 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
}
}
}
for (s = 0; s < 5; ++s) {
for (i = 0; i < NVC0_MAX_BUFFERS; ++i) {
if (nvc0->buffers[s][i].buffer == res) {
nvc0->buffers_dirty[s] |= 1 << i;
nvc0->dirty |= NVC0_NEW_BUFFERS;
nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_BUF);
if (!--ref)
return ref;
}
}
}
}
return ref;

View file

@ -56,6 +56,7 @@
#define NVC0_NEW_SURFACES (1 << 23)
#define NVC0_NEW_MIN_SAMPLES (1 << 24)
#define NVC0_NEW_TESSFACTOR (1 << 25)
#define NVC0_NEW_BUFFERS (1 << 26)
#define NVC0_NEW_CP_PROGRAM (1 << 0)
#define NVC0_NEW_CP_SURFACES (1 << 1)
@ -73,9 +74,10 @@
#define NVC0_BIND_CB(s, i) (164 + 16 * (s) + (i))
#define NVC0_BIND_TFB 244
#define NVC0_BIND_SUF 245
#define NVC0_BIND_SCREEN 246
#define NVC0_BIND_TLS 247
#define NVC0_BIND_3D_COUNT 248
#define NVC0_BIND_BUF 246
#define NVC0_BIND_SCREEN 247
#define NVC0_BIND_TLS 249
#define NVC0_BIND_3D_COUNT 250
/* compute bufctx (during launch_grid) */
#define NVC0_BIND_CP_CB(i) ( 0 + (i))
@ -187,10 +189,15 @@ struct nvc0_context {
struct nvc0_blitctx *blit;
/* NOTE: some of these surfaces may reference buffers */
struct pipe_surface *surfaces[2][NVC0_MAX_SURFACE_SLOTS];
uint16_t surfaces_dirty[2];
uint16_t surfaces_valid[2];
struct pipe_shader_buffer buffers[6][NVC0_MAX_BUFFERS];
uint32_t buffers_dirty[6];
uint32_t buffers_valid[6];
struct util_dynarray global_residents;
};

View file

@ -33,4 +33,6 @@
#define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT 0x00003850
#define NVC0_3D_MACRO_QUERY_BUFFER_WRITE 0x00003858
#endif /* __NVC0_MACROS_H__ */

View file

@ -554,6 +554,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
}
info->io.resInfoCBSlot = 15;
info->io.sampleInfoBase = 256 + 128;
info->io.suInfoBase = 512;
info->io.msInfoCBSlot = 15;
info->io.msInfoBase = 0; /* TODO */
}
@ -635,6 +636,8 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
}
*/
if (info->io.globalAccess)
prog->hdr[0] |= 1 << 26;
if (info->io.globalAccess & 0x2)
prog->hdr[0] |= 1 << 16;
if (info->io.fp64)
prog->hdr[0] |= 1 << 27;

View file

@ -73,6 +73,24 @@ nvc0_get_query_result(struct pipe_context *pipe, struct pipe_query *pq,
return q->funcs->get_query_result(nvc0_context(pipe), q, wait, result);
}
static void
nvc0_get_query_result_resource(struct pipe_context *pipe,
struct pipe_query *pq,
boolean wait,
enum pipe_query_value_type result_type,
int index,
struct pipe_resource *resource,
unsigned offset)
{
struct nvc0_query *q = nvc0_query(pq);
if (!q->funcs->get_query_result_resource) {
assert(!"Unexpected lack of get_query_result_resource");
return;
}
q->funcs->get_query_result_resource(nvc0_context(pipe), q, wait, result_type,
index, resource, offset);
}
static void
nvc0_render_condition(struct pipe_context *pipe,
struct pipe_query *pq,
@ -129,7 +147,7 @@ nvc0_render_condition(struct pipe_context *pipe,
}
if (wait)
nvc0_hw_query_fifo_wait(push, q);
nvc0_hw_query_fifo_wait(nvc0, q);
PUSH_SPACE(push, 7);
PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
@ -262,6 +280,7 @@ nvc0_init_query_functions(struct nvc0_context *nvc0)
pipe->begin_query = nvc0_begin_query;
pipe->end_query = nvc0_end_query;
pipe->get_query_result = nvc0_get_query_result;
pipe->get_query_result_resource = nvc0_get_query_result_resource;
pipe->render_condition = nvc0_render_condition;
nvc0->cond_condmode = NVC0_3D_COND_MODE_ALWAYS;
}

View file

@ -14,6 +14,13 @@ struct nvc0_query_funcs {
void (*end_query)(struct nvc0_context *, struct nvc0_query *);
boolean (*get_query_result)(struct nvc0_context *, struct nvc0_query *,
boolean, union pipe_query_result *);
void (*get_query_result_resource)(struct nvc0_context *nvc0,
struct nvc0_query *q,
boolean wait,
enum pipe_query_value_type result_type,
int index,
struct pipe_resource *resource,
unsigned offset);
};
struct nvc0_query {

View file

@ -358,11 +358,119 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
return true;
}
static void
nvc0_hw_get_query_result_resource(struct nvc0_context *nvc0,
struct nvc0_query *q,
boolean wait,
enum pipe_query_value_type result_type,
int index,
struct pipe_resource *resource,
unsigned offset)
{
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nvc0_hw_query *hq = nvc0_hw_query(q);
struct nv04_resource *buf = nv04_resource(resource);
unsigned stride;
assert(!hq->funcs || !hq->funcs->get_query_result);
if (index == -1) {
/* TODO: Use a macro to write the availability of the query */
if (hq->state != NVC0_HW_QUERY_STATE_READY)
nvc0_hw_query_update(nvc0->screen->base.client, q);
uint32_t ready[2] = {hq->state == NVC0_HW_QUERY_STATE_READY};
nvc0->base.push_cb(&nvc0->base, buf, offset,
result_type >= PIPE_QUERY_TYPE_I64 ? 2 : 1,
ready);
return;
}
/* If the fence guarding this query has not been emitted yet, emit it now;
* an unemitted fence would make a lot of the following logic more
* complicated.
*/
if (hq->is64bit && hq->fence->state < NOUVEAU_FENCE_STATE_EMITTED)
nouveau_fence_emit(hq->fence);
/* We either need to compute a 32- or 64-bit difference between two values,
* and then store the result as either a 32- or 64-bit value. As such, let's
* treat all inputs as 64-bit (and just push an extra 0 for the 32-bit
* ones), and have one macro that clamps the result to i32, u32, or just
* outputs the difference (no need to worry about 64-bit clamping).
*/
if (hq->state != NVC0_HW_QUERY_STATE_READY)
nvc0_hw_query_update(nvc0->screen->base.client, q);
if (wait && hq->state != NVC0_HW_QUERY_STATE_READY)
nvc0_hw_query_fifo_wait(nvc0, q);
nouveau_pushbuf_space(push, 16, 2, 0);
PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
PUSH_REFN (push, buf->bo, buf->domain | NOUVEAU_BO_WR);
BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 2);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
BEGIN_1IC0(push, NVC0_3D(MACRO_QUERY_BUFFER_WRITE), 7);
if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE) /* XXX what if 64-bit? */
PUSH_DATA(push, 0x00000001);
else if (result_type == PIPE_QUERY_TYPE_I32)
PUSH_DATA(push, 0x7fffffff);
else if (result_type == PIPE_QUERY_TYPE_U32)
PUSH_DATA(push, 0xffffffff);
else
PUSH_DATA(push, 0x00000000);
switch (q->type) {
case PIPE_QUERY_SO_STATISTICS:
stride = 2;
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
stride = 12;
break;
default:
assert(index == 0);
stride = 1;
break;
}
if (hq->is64bit) {
nouveau_pushbuf_data(push, hq->bo, hq->offset + 16 * index,
8 | NVC0_IB_ENTRY_1_NO_PREFETCH);
nouveau_pushbuf_data(push, hq->bo, hq->offset + 16 * (index + stride),
8 | NVC0_IB_ENTRY_1_NO_PREFETCH);
} else {
nouveau_pushbuf_data(push, hq->bo, hq->offset + 4,
4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
PUSH_DATA(push, 0);
nouveau_pushbuf_data(push, hq->bo, hq->offset + 16 + 4,
4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
PUSH_DATA(push, 0);
}
if (wait || hq->state == NVC0_HW_QUERY_STATE_READY) {
PUSH_DATA(push, 0);
PUSH_DATA(push, 0);
} else if (hq->is64bit) {
PUSH_DATA(push, hq->fence->sequence);
nouveau_pushbuf_data(push, nvc0->screen->fence.bo, 0,
4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
} else {
PUSH_DATA(push, hq->sequence);
nouveau_pushbuf_data(push, hq->bo, hq->offset,
4 | NVC0_IB_ENTRY_1_NO_PREFETCH);
}
if (buf->mm) {
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
}
}
static const struct nvc0_query_funcs hw_query_funcs = {
.destroy_query = nvc0_hw_destroy_query,
.begin_query = nvc0_hw_begin_query,
.end_query = nvc0_hw_end_query,
.get_query_result = nvc0_hw_get_query_result,
.get_query_result_resource = nvc0_hw_get_query_result_resource,
};
struct nvc0_query *
@ -476,8 +584,9 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
}
void
nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q)
nvc0_hw_query_fifo_wait(struct nvc0_context *nvc0, struct nvc0_query *q)
{
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nvc0_hw_query *hq = nvc0_hw_query(q);
unsigned offset = hq->offset;
@ -486,9 +595,15 @@ nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q)
PUSH_SPACE(push, 5);
PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
PUSH_DATAh(push, hq->bo->offset + offset);
PUSH_DATA (push, hq->bo->offset + offset);
PUSH_DATA (push, hq->sequence);
if (hq->is64bit) {
PUSH_DATAh(push, nvc0->screen->fence.bo->offset);
PUSH_DATA (push, nvc0->screen->fence.bo->offset);
PUSH_DATA (push, hq->fence->sequence);
} else {
PUSH_DATAh(push, hq->bo->offset + offset);
PUSH_DATA (push, hq->bo->offset + offset);
PUSH_DATA (push, hq->sequence);
}
PUSH_DATA (push, (1 << 12) |
NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
}

View file

@ -51,6 +51,6 @@ void
nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *,
unsigned);
void
nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *);
nvc0_hw_query_fifo_wait(struct nvc0_context *, struct nvc0_query *);
#endif

View file

@ -111,6 +111,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 256;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
return 1; /* 256 for binding as RT, but that's not possible in GL */
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
return 16;
case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
return NOUVEAU_MIN_BUFFER_MAP_ALIGN;
case PIPE_CAP_MAX_VIEWPORTS:
@ -189,6 +191,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@ -212,10 +215,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
case PIPE_CAP_VENDOR_ID:
@ -322,8 +327,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
return 0;
case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
return NVC0_MAX_BUFFERS;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
return 16; /* would be 32 in linked (OpenGL-style) mode */
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
@ -676,8 +682,9 @@ nvc0_screen_create(struct nouveau_device *dev)
push->rsvd_kick = 5;
screen->base.vidmem_bindings |= PIPE_BIND_CONSTANT_BUFFER |
PIPE_BIND_SHADER_BUFFER |
PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER |
PIPE_BIND_COMMAND_ARGS_BUFFER;
PIPE_BIND_COMMAND_ARGS_BUFFER | PIPE_BIND_QUERY_BUFFER;
screen->base.sysmem_bindings |=
PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
@ -891,9 +898,9 @@ nvc0_screen_create(struct nouveau_device *dev)
/* TIC and TSC entries for each unit (nve4+ only) */
/* auxiliary constants (6 user clip planes, base instance id) */
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 512);
PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (i << 9));
PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (i << 9));
PUSH_DATA (push, 1024);
PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (i << 10));
PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (i << 10));
BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
PUSH_DATA (push, (15 << 4) | 1);
if (screen->eng3d->oclass >= NVE4_3D_CLASS) {
@ -913,8 +920,8 @@ nvc0_screen_create(struct nouveau_device *dev)
/* return { 0.0, 0.0, 0.0, 0.0 } for out-of-bounds vtxbuf access */
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 256);
PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
BEGIN_1IC0(push, NVC0_3D(CB_POS), 5);
PUSH_DATA (push, 0);
PUSH_DATAf(push, 0.0f);
@ -922,8 +929,8 @@ nvc0_screen_create(struct nouveau_device *dev)
PUSH_DATAf(push, 0.0f);
PUSH_DATAf(push, 0.0f);
BEGIN_NVC0(push, NVC0_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2);
PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 10));
if (screen->base.drm->version >= 0x01000101) {
ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
@ -953,8 +960,12 @@ nvc0_screen_create(struct nouveau_device *dev)
PUSH_DATA (push, screen->tls->size);
BEGIN_NVC0(push, NVC0_3D(WARP_TEMP_ALLOC), 1);
PUSH_DATA (push, 0);
/* Reduce the likelihood of collision with real buffers by placing the hole
* at the top of the 4G area. This will have to be dealt with properly
* eventually by blocking that area off from the VM.
*/
BEGIN_NVC0(push, NVC0_3D(LOCAL_BASE), 1);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0xff << 24);
if (screen->eng3d->oclass < GM107_3D_CLASS) {
ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL,
@ -1039,6 +1050,7 @@ nvc0_screen_create(struct nouveau_device *dev)
MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write);
BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
PUSH_DATA (push, 1);

View file

@ -22,6 +22,8 @@
#define NVC0_MAX_VIEWPORTS 16
#define NVC0_MAX_BUFFERS 32
struct nvc0_context;

View file

@ -316,7 +316,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
continue;
if (!targ->clean)
nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
nvc0_hw_query_fifo_wait(nvc0, nvc0_query(targ->pq));
nouveau_pushbuf_space(push, 0, 0, 1);
BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
PUSH_DATA (push, 1);

View file

@ -1243,11 +1243,50 @@ nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader,
unsigned start_slot, unsigned count,
struct pipe_image_view **views)
{
#if 0
nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, views);
nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES;
#endif
}
static void
nvc0_bind_buffers_range(struct nvc0_context *nvc0, const unsigned t,
unsigned start, unsigned nr,
struct pipe_shader_buffer *pbuffers)
{
const unsigned end = start + nr;
const unsigned mask = ((1 << nr) - 1) << start;
unsigned i;
assert(t < 5);
if (pbuffers) {
for (i = start; i < end; ++i) {
const unsigned p = i - start;
if (pbuffers[p].buffer)
nvc0->buffers_valid[t] |= (1 << i);
else
nvc0->buffers_valid[t] &= ~(1 << i);
nvc0->buffers[t][i].buffer_offset = pbuffers[p].buffer_offset;
nvc0->buffers[t][i].buffer_size = pbuffers[p].buffer_size;
pipe_resource_reference(&nvc0->buffers[t][i].buffer, pbuffers[p].buffer);
}
} else {
for (i = start; i < end; ++i)
pipe_resource_reference(&nvc0->buffers[t][i].buffer, NULL);
nvc0->buffers_valid[t] &= ~mask;
}
nvc0->buffers_dirty[t] |= mask;
nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_BUF);
}
static void
nvc0_set_shader_buffers(struct pipe_context *pipe,
unsigned shader,
unsigned start, unsigned nr,
struct pipe_shader_buffer *buffers)
{
const unsigned s = nvc0_shader_stage(shader);
nvc0_bind_buffers_range(nvc0_context(pipe), s, start, nr, buffers);
nvc0_context(pipe)->dirty |= NVC0_NEW_BUFFERS;
}
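
The dirty/valid bookkeeping relies on a contiguous slot mask; a quick standalone check of the expression used above (as written, the mask expression needs nr < 32 to be well-defined C):

#include <assert.h>

int main(void)
{
   unsigned start = 2, nr = 3;
   unsigned mask = ((1u << nr) - 1u) << start;
   assert(mask == 0x1c);   /* 0b11100: slots 2, 3 and 4 */
   return 0;
}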
static inline void
@ -1377,6 +1416,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
pipe->set_global_binding = nvc0_set_global_bindings;
pipe->set_compute_resources = nvc0_set_compute_resources;
pipe->set_shader_images = nvc0_set_shader_images;
pipe->set_shader_buffers = nvc0_set_shader_buffers;
nvc0->sample_mask = ~0;
nvc0->min_samples = 1;

View file

@ -183,9 +183,9 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
ms = 1 << ms_mode;
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 512);
PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 9));
PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 9));
PUSH_DATA (push, 1024);
PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 10));
PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (4 << 10));
BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms);
PUSH_DATA (push, 256 + 128);
for (i = 0; i < ms; i++) {
@ -317,9 +317,9 @@ nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
struct nouveau_bo *bo = nvc0->screen->uniform_bo;
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 512);
PUSH_DATAh(push, bo->offset + (5 << 16) + (s << 9));
PUSH_DATA (push, bo->offset + (5 << 16) + (s << 9));
PUSH_DATA (push, 1024);
PUSH_DATAh(push, bo->offset + (5 << 16) + (s << 10));
PUSH_DATA (push, bo->offset + (5 << 16) + (s << 10));
BEGIN_1IC0(push, NVC0_3D(CB_POS), PIPE_MAX_CLIP_PLANES * 4 + 1);
PUSH_DATA (push, 256);
PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
@ -470,6 +470,39 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
}
}
static void
nvc0_validate_buffers(struct nvc0_context *nvc0)
{
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
int i, s;
for (s = 0; s < 5; s++) {
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 1024);
PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (s << 10));
PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (s << 10));
BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS);
PUSH_DATA (push, 512);
for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
if (nvc0->buffers[s][i].buffer) {
struct nv04_resource *res =
nv04_resource(nvc0->buffers[s][i].buffer);
PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
PUSH_DATA (push, 0);
BCTX_REFN(nvc0->bufctx_3d, BUF, res, RDWR);
} else {
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
}
}
}
}
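
For readers following the address math: the validation loop above writes each stage's shader-buffer descriptors into the driver constant buffer at uniform_bo->offset + (5 << 16) + (s << 10), so each of the five stages owns a 1 KiB slice (grown from 512 bytes elsewhere in this series) with the four-word buffer descriptors starting at byte 512 of the slice. A minimal sketch, assuming a made-up uniform_bo offset:

#include <stdio.h>

int main(void)
{
   const unsigned long long uniform_bo_offset = 0x100000;   /* hypothetical */
   for (int s = 0; s < 5; s++) {
      unsigned long long base =
         uniform_bo_offset + (5u << 16) + ((unsigned)s << 10);
      printf("stage %d: slice at 0x%llx, buffer descriptors at 0x%llx\n",
             s, base, base + 512);
   }
   return 0;
}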
static void
nvc0_validate_sample_mask(struct nvc0_context *nvc0)
{
@ -663,6 +696,7 @@ static struct state_validate {
{ nve4_set_tex_handles, NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS },
{ nvc0_vertex_arrays_validate, NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS },
{ nvc0_validate_surfaces, NVC0_NEW_SURFACES },
{ nvc0_validate_buffers, NVC0_NEW_BUFFERS },
{ nvc0_idxbuf_validate, NVC0_NEW_IDXBUF },
{ nvc0_tfb_validate, NVC0_NEW_TFB_TARGETS | NVC0_NEW_GMTYPROG },
{ nvc0_validate_min_samples, NVC0_NEW_MIN_SAMPLES },

View file

@ -357,27 +357,132 @@ nvc0_clear_render_target(struct pipe_context *pipe,
}
static void
nvc0_clear_buffer_cpu(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
nvc0_clear_buffer_push_nvc0(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nv04_resource *buf = nv04_resource(res);
struct pipe_transfer *pt;
struct pipe_box box;
unsigned elements, i;
unsigned i;
elements = size / data_size;
nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
nouveau_pushbuf_bufctx(push, nvc0->bufctx);
nouveau_pushbuf_validate(push);
u_box_1d(offset, size, &box);
unsigned count = (size + 3) / 4;
unsigned data_words = data_size / 4;
uint8_t *map = buf->vtbl->transfer_map(pipe, res, 0, PIPE_TRANSFER_WRITE,
&box, &pt);
while (count) {
unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
unsigned nr = nr_data * data_words;
for (i = 0; i < elements; ++i)
memcpy(&map[i*data_size], data, data_size);
if (!PUSH_SPACE(push, nr + 9))
break;
buf->vtbl->transfer_unmap(pipe, pt);
BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
PUSH_DATA (push, MIN2(size, nr * 4));
PUSH_DATA (push, 1);
BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
PUSH_DATA (push, 0x100111);
/* must not be interrupted (trap on QUERY fence, 0x50 works however) */
BEGIN_NIC0(push, NVC0_M2MF(DATA), nr);
for (i = 0; i < nr_data; i++)
PUSH_DATAp(push, data, data_words);
count -= nr;
offset += nr * 4;
size -= nr * 4;
}
if (buf->mm) {
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
}
nouveau_bufctx_reset(nvc0->bufctx, 0);
}
static void
nvc0_clear_buffer_push_nve4(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nv04_resource *buf = nv04_resource(res);
unsigned i;
nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
nouveau_pushbuf_bufctx(push, nvc0->bufctx);
nouveau_pushbuf_validate(push);
unsigned count = (size + 3) / 4;
unsigned data_words = data_size / 4;
while (count) {
unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
unsigned nr = nr_data * data_words;
if (!PUSH_SPACE(push, nr + 10))
break;
BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_LINE_LENGTH_IN), 2);
PUSH_DATA (push, MIN2(size, nr * 4));
PUSH_DATA (push, 1);
/* must not be interrupted (trap on QUERY fence, 0x50 works however) */
BEGIN_1IC0(push, NVE4_P2MF(UPLOAD_EXEC), nr + 1);
PUSH_DATA (push, 0x1001);
for (i = 0; i < nr_data; i++)
PUSH_DATAp(push, data, data_words);
count -= nr;
offset += nr * 4;
size -= nr * 4;
}
if (buf->mm) {
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
}
nouveau_bufctx_reset(nvc0->bufctx, 0);
}
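
Both push-clear paths slice the upload the same way: at most NV04_PFIFO_MAX_PACKET_LEN data words per packet, trimmed down to a whole number of clear-pattern repeats so that, for example, a 16-byte RGBA32 pattern never straddles a packet boundary. A rough standalone sketch of that accounting (the word limit is copied here as a local constant; the real code takes it from the nouveau headers):

#include <stdio.h>

#define MAX_PACKET_LEN 2047   /* local stand-in for NV04_PFIFO_MAX_PACKET_LEN */

int main(void)
{
   unsigned size = 70000, data_size = 16;     /* hypothetical RGBA32 clear */
   unsigned count = (size + 3) / 4;           /* dwords to write */
   unsigned data_words = data_size / 4;       /* dwords per pattern repeat */

   while (count) {
      unsigned cap = count < MAX_PACKET_LEN ? count : MAX_PACKET_LEN;
      unsigned nr_data = cap / data_words;    /* whole repeats only */
      unsigned nr = nr_data * data_words;
      if (!nr)
         break;                               /* guard for the sketch only */
      printf("packet: %u dwords (%u repeats)\n", nr, nr_data);
      count -= nr;
   }
   return 0;
}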
static void
nvc0_clear_buffer_push(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
unsigned tmp;
if (data_size == 1) {
tmp = *(unsigned char *)data;
tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
data = &tmp;
data_size = 4;
} else if (data_size == 2) {
tmp = *(unsigned short *)data;
tmp = (tmp << 16) | tmp;
data = &tmp;
data_size = 4;
}
if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
nvc0_clear_buffer_push_nvc0(pipe, res, offset, size, data, data_size);
else
nvc0_clear_buffer_push_nve4(pipe, res, offset, size, data, data_size);
}
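
The dispatcher above widens sub-dword clear values before choosing a path, so the M2MF/P2MF uploads can always emit whole dwords. A small sketch with the same semantics as the tmp handling in nvc0_clear_buffer_push():

#include <assert.h>
#include <stdint.h>

static uint32_t widen_clear_value(const void *data, int data_size)
{
   uint32_t tmp;
   if (data_size == 1) {
      tmp = *(const uint8_t *)data;
      return (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
   }
   if (data_size == 2) {
      tmp = *(const uint16_t *)data;
      return (tmp << 16) | tmp;
   }
   return *(const uint32_t *)data;
}

int main(void)
{
   uint8_t  b = 0xab;
   uint16_t h = 0x1234;
   assert(widen_clear_value(&b, 1) == 0xabababab);
   assert(widen_clear_value(&h, 2) == 0x12341234);
   return 0;
}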
static void
@ -402,10 +507,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
memcpy(&color.ui, data, 16);
break;
case 12:
/* This doesn't work, RGB32 is not a valid RT format.
* dst_fmt = PIPE_FORMAT_R32G32B32_UINT;
* memcpy(&color.ui, data, 12);
* memset(&color.ui[3], 0, 4);
/* RGB32 is not a valid RT format. This will be handled by the pushbuf
* uploader.
*/
break;
case 8:
@ -437,14 +540,26 @@ nvc0_clear_buffer(struct pipe_context *pipe,
assert(size % data_size == 0);
if (data_size == 12) {
/* TODO: Find a way to do this with the GPU! */
nvc0_clear_buffer_cpu(pipe, res, offset, size, data, data_size);
nvc0_clear_buffer_push(pipe, res, offset, size, data, data_size);
return;
}
if (offset & 0xff) {
unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
assert(fixup_size % data_size == 0);
nvc0_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
offset += fixup_size;
size -= fixup_size;
if (!size)
return;
}
elements = size / data_size;
height = (elements + 16383) / 16384;
width = elements / height;
if (height > 1)
width &= ~0xff;
assert(width > 0);
if (!PUSH_SPACE(push, 40))
return;
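
Two fixups land here: leading bytes below the next 256-byte boundary are cleared through the push path first, and the remaining elements are folded into a 2D linear render target whose width is rounded down to a multiple of 256 elements whenever more than one row is used, keeping the pitch aligned. A worked sketch with hypothetical sizes:

#include <stdio.h>

int main(void)
{
   unsigned offset = 0x130, size = 400000, data_size = 4;   /* hypothetical */

   if (offset & 0xff) {
      unsigned aligned = (offset + 0xff) & ~0xffu;   /* align(offset, 0x100) */
      unsigned fixup = aligned - offset < size ? aligned - offset : size;
      printf("push-clear %u bytes at 0x%x\n", fixup, offset);
      offset += fixup;
      size -= fixup;
   }

   unsigned elements = size / data_size;
   unsigned height = (elements + 16383) / 16384;
   unsigned width = elements / height;
   if (height > 1)
      width &= ~0xffu;
   printf("RT clear %ux%u, %u elements left for the push path\n",
          width, height, elements - width * height);
   return 0;
}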
@ -465,7 +580,7 @@ nvc0_clear_buffer(struct pipe_context *pipe,
BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 9);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
PUSH_DATA (push, width * data_size);
PUSH_DATA (push, align(width * data_size, 0x100));
PUSH_DATA (push, height);
PUSH_DATA (push, nvc0_format_table[dst_fmt].rt);
PUSH_DATA (push, NVC0_3D_RT_TILE_MODE_LINEAR);
@ -480,24 +595,20 @@ nvc0_clear_buffer(struct pipe_context *pipe,
IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
if (buf->mm) {
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
}
if (width * height != elements) {
offset += width * height * data_size;
width = elements - width * height;
height = 1;
BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 4);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
PUSH_DATA (push, width * data_size);
PUSH_DATA (push, height);
IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
nvc0_clear_buffer_push(pipe, res, offset, width * data_size,
data, data_size);
}
IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
}

View file

@ -515,12 +515,12 @@ nve4_set_tex_handles(struct nvc0_context *nvc0)
return;
address = nvc0->screen->uniform_bo->offset + (5 << 16);
for (s = 0; s < 5; ++s, address += (1 << 9)) {
for (s = 0; s < 5; ++s, address += (1 << 10)) {
uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
if (!dirty)
continue;
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 512);
PUSH_DATA (push, 1024);
PUSH_DATAh(push, address);
PUSH_DATA (push, address);
do {

View file

@ -334,7 +334,7 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)
b = ve->pipe.vertex_buffer_index;
vb = &nvc0->vtxbuf[b];
if (!vb->buffer) {
if (nvc0->vbo_user & (1 << b)) {
if (!(nvc0->constant_vbos & (1 << b))) {
if (ve->pipe.instance_divisor) {
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_DIVISOR(i)), 1);
@ -352,13 +352,13 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)
if (unlikely(ve->pipe.instance_divisor)) {
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 4);
PUSH_DATA (push, (1 << 12) | vb->stride);
PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
PUSH_DATAh(push, res->address + offset);
PUSH_DATA (push, res->address + offset);
PUSH_DATA (push, ve->pipe.instance_divisor);
} else {
BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 3);
PUSH_DATA (push, (1 << 12) | vb->stride);
PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
PUSH_DATAh(push, res->address + offset);
PUSH_DATA (push, res->address + offset);
}
@ -382,7 +382,7 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
unsigned b;
const uint32_t mask = nvc0->vbo_user;
PUSH_SPACE(push, nvc0->num_vtxbufs * 8);
PUSH_SPACE(push, nvc0->num_vtxbufs * 8 + nvc0->vertex->num_elements);
for (b = 0; b < nvc0->num_vtxbufs; ++b) {
struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[b];
struct nv04_resource *buf;
@ -395,6 +395,10 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
}
/* address/value set in nvc0_update_user_vbufs_shared */
continue;
} else if (!vb->buffer) {
/* there can be holes in the vertex buffer lists */
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 0);
continue;
}
buf = nv04_resource(vb->buffer);
offset = vb->buffer_offset;
@ -410,6 +414,12 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
BCTX_REFN(nvc0->bufctx_3d, VTX, buf, RD);
}
/* If there are more elements than buffers, we might not have unset
* fetching on the later elements.
*/
for (; b < nvc0->vertex->num_elements; ++b)
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 0);
if (nvc0->vbo_user)
nvc0_update_user_vbufs_shared(nvc0);
}
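
The two added loops enforce a single rule: any vertex-element slot without a backing buffer, whether a hole in the middle of the list or a trailing element past num_vtxbufs, must have its VERTEX_ARRAY_FETCH explicitly turned off so enables left over from previous state do not survive. A toy model of that rule (hypothetical helper, not driver code):

#include <stdbool.h>
#include <stdio.h>

static void validate_fetch(const bool *has_buffer, unsigned num_vtxbufs,
                           unsigned num_elements)
{
   unsigned b;
   for (b = 0; b < num_vtxbufs; ++b) {
      if (!has_buffer[b])
         printf("element %u: disable fetch (hole)\n", b);
      else
         printf("element %u: program fetch\n", b);
   }
   /* elements beyond the bound buffers also need fetch unset */
   for (; b < num_elements; ++b)
      printf("element %u: disable fetch (trailing)\n", b);
}

int main(void)
{
   bool has_buffer[4] = { true, false, true, true };
   validate_fetch(has_buffer, 4, 6);
   return 0;
}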
@ -680,7 +690,7 @@ nvc0_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
if (count & 1) {
count--;
PUSH_SPACE(push, 1);
PUSH_SPACE(push, 2);
BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
PUSH_DATA (push, *map++);
}
@ -779,7 +789,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
PUSH_SPACE(push, 2);
IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
nvc0_hw_query_fifo_wait(push, nvc0_query(so->pq));
nvc0_hw_query_fifo_wait(nvc0, nvc0_query(so->pq));
if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
@ -811,6 +821,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
uint32_t offset = buf->offset + info->indirect_offset;
PUSH_SPACE(push, 7);
/* must make FIFO wait for engines idle before continuing to process */
if ((buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) ||
(buf_count && buf_count->fence_wr &&
@ -951,6 +963,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
if (info->mode == PIPE_PRIM_PATCHES &&
nvc0->state.patch_vertices != info->vertices_per_patch) {
nvc0->state.patch_vertices = info->vertices_per_patch;
PUSH_SPACE(push, 1);
IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices);
}
@ -958,6 +971,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
nvc0_state_validate(nvc0, ~0, 8);
if (nvc0->vertprog->vp.need_draw_parameters) {
PUSH_SPACE(push, 9);
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
PUSH_DATA (push, 512);
PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
@ -979,6 +993,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
}
if (nvc0->cb_dirty) {
PUSH_SPACE(push, 1);
IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
nvc0->cb_dirty = false;
}
@ -987,6 +1002,8 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
if (!nvc0->textures_coherent[s])
continue;
PUSH_SPACE(push, nvc0->num_textures[s] * 2);
for (int i = 0; i < nvc0->num_textures[s]; ++i) {
struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
if (!(nvc0->textures_coherent[s] & (1 << i)))

View file

@ -210,6 +210,10 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
case PIPE_CAP_QUERY_MEMORY_INFO:
return 0;
/* SWTCL-only features. */

View file

@ -225,7 +225,7 @@ void *evergreen_create_compute_state(
}
}
#else
memset(&shader->binary, 0, sizeof(shader->binary));
radeon_shader_binary_init(&shader->binary);
radeon_elf_read(code, header->num_bytes, &shader->binary);
r600_create_shader(&shader->bc, &shader->binary, &use_kill);
@ -245,13 +245,31 @@ void *evergreen_create_compute_state(
return shader;
}
void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
{
struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
struct r600_context *ctx = (struct r600_context *)ctx_;
COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
struct r600_pipe_compute *shader = state;
if (!shader)
return;
#ifdef HAVE_OPENCL
#if HAVE_LLVM < 0x0306
for (unsigned i = 0; i < shader->num_kernels; i++) {
struct r600_kernel *kernel = &shader->kernels[i];
LLVMDisposeModule(kernel->llvm_module);
}
FREE(shader->kernels);
LLVMContextDispose(shader->llvm_ctx);
#else
radeon_shader_binary_clean(&shader->binary);
r600_destroy_shader(&shader->bc);
/* TODO destroy shader->code_bo, shader->const_bo
* we'll need something like r600_buffer_free */
#endif
#endif
FREE(shader);
}
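
The memset-to-radeon_shader_binary_init() switch above looks cosmetic, but it keeps construction and destruction symmetric: whichever helper owns the fields also frees them in radeon_shader_binary_clean(). A generic sketch of the pattern with stand-in names (not the real radeon structs):

#include <stdlib.h>
#include <string.h>

struct binary {                     /* stand-in for radeon_shader_binary */
   unsigned char *code;
   unsigned code_size;
};

static void binary_init(struct binary *b)
{
   memset(b, 0, sizeof(*b));        /* today init is just a clear ... */
}

static void binary_clean(struct binary *b)
{
   free(b->code);                   /* ... but clean owns the frees, so the
                                     * pair stays correct as fields grow */
   memset(b, 0, sizeof(*b));
}

int main(void)
{
   struct binary b;
   binary_init(&b);
   b.code = malloc(16);
   b.code_size = 16;
   binary_clean(&b);
   return 0;
}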
@ -349,7 +367,7 @@ static void evergreen_emit_direct_dispatch(
struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
unsigned num_waves;
unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
unsigned wave_divisor = (16 * num_pipes);
int group_size = 1;
int grid_size = 1;
@ -723,7 +741,7 @@ static void evergreen_set_global_binding(
* command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
* packet requires that the shader type bit be set, we must initialize all
* context registers needed for compute in this function. The registers
* intialized by the start_cs_cmd atom can be found in evereen_state.c in the
* initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
* functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
* on the GPU family.
*/
@ -733,7 +751,7 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
int num_threads;
int num_stack_entries;
/* since all required registers are initialised in the
/* since all required registers are initialized in the
* start_compute_cs_cmd atom, we can EMIT_EARLY here.
*/
r600_init_command_buffer(cb, 256);
@ -818,7 +836,7 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
* R_008E28_SQ_STATIC_THREAD_MGMT3
*/
/* XXX: We may need to adjust the thread and stack resouce
/* XXX: We may need to adjust the thread and stack resource
* values for 3D/compute interop */
r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

View file

@ -772,7 +772,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
if (util_format_get_blocksize(pipe_format) >= 16)
non_disp_tiling = 1;
}
nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
nbanks = eg_num_banks(rscreen->b.info.r600_num_banks);
if (state->target == PIPE_TEXTURE_1D_ARRAY) {
height = 1;
@ -986,7 +986,7 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx,
unsigned block_size =
align(util_format_get_blocksize(pipe_buffer->format), 4);
unsigned pitch_alignment =
MAX2(64, rctx->screen->b.tiling_info.group_bytes / block_size);
MAX2(64, rctx->screen->b.info.pipe_interleave_bytes / block_size);
unsigned pitch = align(pipe_buffer->width0, pitch_alignment);
/* XXX: This is copied from evergreen_init_color_surface(). I don't
@ -1098,7 +1098,7 @@ void evergreen_init_color_surface(struct r600_context *rctx,
if (util_format_get_blocksize(surf->base.format) >= 16)
non_disp_tiling = 1;
}
nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
nbanks = eg_num_banks(rscreen->b.info.r600_num_banks);
desc = util_format_description(surf->base.format);
for (i = 0; i < 4; i++) {
if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
@ -1253,7 +1253,7 @@ static void evergreen_init_depth_surface(struct r600_context *rctx,
macro_aspect = eg_macro_tile_aspect(macro_aspect);
bankw = eg_bank_wh(bankw);
bankh = eg_bank_wh(bankh);
nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
nbanks = eg_num_banks(rscreen->b.info.r600_num_banks);
offset >>= 8;
surf->db_z_info = S_028040_ARRAY_MODE(array_mode) |
@ -3467,7 +3467,7 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
sub_cmd = EG_DMA_COPY_TILED;
lbpp = util_logbase2(bpp);
pitch_tile_max = ((pitch / bpp) / 8) - 1;
nbanks = eg_num_banks(rctx->screen->b.tiling_info.num_banks);
nbanks = eg_num_banks(rctx->screen->b.info.r600_num_banks);
if (dst_mode == RADEON_SURF_MODE_LINEAR) {
/* T2L */
@ -3670,9 +3670,9 @@ void evergreen_init_state_functions(struct r600_context *rctx)
unsigned id = 1;
unsigned i;
/* !!!
* To avoid GPU lockup registers must be emited in a specific order
* To avoid GPU lockup registers must be emitted in a specific order
* (no kidding ...). The order below is important and has been
* partialy infered from analyzing fglrx command stream.
* partially inferred from analyzing fglrx command stream.
*
* Don't reorder atom without carefully checking the effect (GPU lockup
* or piglit regression).
@ -3793,7 +3793,7 @@ void evergreen_setup_tess_constants(struct r600_context *rctx, const struct pipe
unsigned output_patch0_offset, perpatch_output_offset, lds_size;
uint32_t values[16];
unsigned num_waves;
unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
unsigned wave_divisor = (16 * num_pipes);
*num_patches = 1;

Some files were not shown because too many files have changed in this diff