From c704b89fe49dd3f80ff0028bece93ea45b651987 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.velikov@collabora.com>
Date: Tue, 29 Dec 2015 21:02:54 +1100
Subject: [PATCH 001/241] glsl: annotate
 ast_process_struct_or_iface_block_members() as static

Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/ast_to_hir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 1091c022703..e1baadaf65f 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -6169,7 +6169,7 @@ ast_type_specifier::hir(exec_list *instructions,
  * The number of fields processed.  A pointer to the array structure fields is
  * stored in \c *fields_ret.
  */
-unsigned
+static unsigned
 ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           struct _mesa_glsl_parse_state *state,
                                           exec_list *declarations,

From 2c1a215409e6d5b3bc95854e17da2412a978e0c9 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.velikov@collabora.com>
Date: Tue, 29 Dec 2015 21:02:55 +1100
Subject: [PATCH 002/241] glsl/linker: annotate static functions as such

Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/link_uniform_block_active_visitor.cpp | 4 ++--
 src/glsl/link_uniform_blocks.cpp               | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp
index 422739af063..54fea700b53 100644
--- a/src/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/glsl/link_uniform_block_active_visitor.cpp
@@ -24,7 +24,7 @@
 #include "link_uniform_block_active_visitor.h"
 #include "program.h"
 
-link_uniform_block_active *
+static link_uniform_block_active *
 process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
 {
    const hash_entry *const existing_block =
@@ -92,7 +92,7 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
  * and not over complicating the code we will end up with a count of 8.
  * Here each dimension has 2 different indices counted so we end up with 2*2*2
  */
-struct uniform_block_array_elements **
+static struct uniform_block_array_elements **
 process_arrays(void *mem_ctx, ir_dereference_array *ir,
                struct link_uniform_block_active *block)
 {
diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp
index d5d30bb0a0d..7d755765852 100644
--- a/src/glsl/link_uniform_blocks.cpp
+++ b/src/glsl/link_uniform_blocks.cpp
@@ -266,7 +266,7 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name,
 /* This function resizes the array types of the block so that later we can use
  * this new size to correctly calculate the offest for indirect indexing.
  */
-const glsl_type *
+static const glsl_type *
 resize_block_array(const glsl_type *type,
                    struct uniform_block_array_elements *ub_array)
 {

From 0d4cd045c8a74efd7d43ceb841bde43cc6eee65d Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 29 Dec 2015 21:02:56 +1100
Subject: [PATCH 003/241] glsl: tidy up struct with a single member

There used to be more members but they now share other fields
in order to keep memory use low.

Also making the naming more generic will allow us to reuse the
field for explicit byte offsets within blocks for
ARB_enhanced_layouts.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/ast_to_hir.cpp          |  2 +-
 src/glsl/ir.cpp                  |  2 +-
 src/glsl/ir.h                    |  4 +---
 src/glsl/link_atomics.cpp        | 16 ++++++++--------
 src/glsl/linker.cpp              |  2 +-
 src/glsl/nir/glsl_to_nir.cpp     |  2 +-
 src/glsl/nir/nir.h               |  4 +---
 src/glsl/nir/nir_lower_atomics.c |  2 +-
 8 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index e1baadaf65f..dbf05ac9999 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3105,7 +3105,7 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
                _mesa_glsl_error(loc, state,
                                 "misaligned atomic counter offset");
 
-            var->data.atomic.offset = *offset;
+            var->data.offset = *offset;
             *offset += var->type->atomic_size();
 
          } else {
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 70227070ca7..d82bccd5d2f 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1674,7 +1674,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
    this->data.mode = mode;
    this->data.interpolation = INTERP_QUALIFIER_NONE;
    this->data.max_array_access = 0;
-   this->data.atomic.offset = 0;
+   this->data.offset = 0;
    this->data.precision = GLSL_PRECISION_NONE;
    this->data.image_read_only = false;
    this->data.image_write_only = false;
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 159f94d9edd..c56c95994b8 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -871,9 +871,7 @@ public:
       /**
        * Location an atomic counter is stored at.
        */
-      struct {
-         unsigned offset;
-      } atomic;
+      unsigned offset;
 
       /**
        * Highest element accessed with a constant expression array index
diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp
index 3aa52dbd18a..277d4737ab7 100644
--- a/src/glsl/link_atomics.cpp
+++ b/src/glsl/link_atomics.cpp
@@ -83,16 +83,16 @@ namespace {
       const active_atomic_counter *const first = (active_atomic_counter *) a;
       const active_atomic_counter *const second = (active_atomic_counter *) b;
 
-      return int(first->var->data.atomic.offset) - int(second->var->data.atomic.offset);
+      return int(first->var->data.offset) - int(second->var->data.offset);
    }
 
    bool
    check_atomic_counters_overlap(const ir_variable *x, const ir_variable *y)
    {
-      return ((x->data.atomic.offset >= y->data.atomic.offset &&
-               x->data.atomic.offset < y->data.atomic.offset + y->type->atomic_size()) ||
-              (y->data.atomic.offset >= x->data.atomic.offset &&
-               y->data.atomic.offset < x->data.atomic.offset + x->type->atomic_size()));
+      return ((x->data.offset >= y->data.offset &&
+               x->data.offset < y->data.offset + y->type->atomic_size()) ||
+              (y->data.offset >= x->data.offset &&
+               y->data.offset < x->data.offset + x->type->atomic_size()));
    }
 
    void
@@ -158,7 +158,7 @@ namespace {
             ir_variable *var = node->as_variable();
 
             if (var && var->type->contains_atomic()) {
-               int offset = var->data.atomic.offset;
+               int offset = var->data.offset;
                unsigned uniform_loc = var->data.location;
                process_atomic_variable(var->type, prog, &uniform_loc,
                                        var, buffers, num_buffers, &offset, i);
@@ -185,7 +185,7 @@ namespace {
                linker_error(prog, "Atomic counter %s declared at offset %d "
                             "which is already in use.",
                             buffers[i].counters[j].var->name,
-                            buffers[i].counters[j].var->data.atomic.offset);
+                            buffers[i].counters[j].var->data.offset);
             }
          }
       }
@@ -237,7 +237,7 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
             var->data.binding = i;
 
          storage->atomic_buffer_index = i;
-         storage->offset = var->data.atomic.offset;
+         storage->offset = var->data.offset;
          storage->array_stride = (var->type->is_array() ?
                                   var->type->without_array()->atomic_size() : 0);
          if (!var->type->is_matrix())
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index c7e69765335..a6e81b41f3c 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1014,7 +1014,7 @@ cross_validate_globals(struct gl_shader_program *prog,
             }
 
             if (var->type->contains_atomic() &&
-                var->data.atomic.offset != existing->data.atomic.offset) {
+                var->data.offset != existing->data.offset) {
                linker_error(prog, "offset specifications for %s "
                             "`%s' have differing values\n",
                             mode_string(var), var->name);
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 0d1d0f4e282..12efb4419b2 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -389,7 +389,7 @@ nir_visitor::visit(ir_variable *ir)
 
    var->data.index = ir->data.index;
    var->data.binding = ir->data.binding;
-   var->data.atomic.offset = ir->data.atomic.offset;
+   var->data.offset = ir->data.offset;
    var->data.image.read_only = ir->data.image_read_only;
    var->data.image.write_only = ir->data.image_write_only;
    var->data.image.coherent = ir->data.image_coherent;
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 562c5c5cc8c..a8972ac6ad5 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -291,9 +291,7 @@ typedef struct {
       /**
        * Location an atomic counter is stored at.
        */
-      struct {
-         unsigned offset;
-      } atomic;
+      unsigned offset;
 
       /**
        * ARB_shader_image_load_store qualifiers.
diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c
index 259c154149b..1aa78e18a85 100644
--- a/src/glsl/nir/nir_lower_atomics.c
+++ b/src/glsl/nir/nir_lower_atomics.c
@@ -74,7 +74,7 @@ lower_instr(nir_intrinsic_instr *instr,
       state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index;
 
    nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
-   offset_const->value.u[0] = instr->variables[0]->var->data.atomic.offset;
+   offset_const->value.u[0] = instr->variables[0]->var->data.offset;
 
    nir_instr_insert_before(&instr->instr, &offset_const->instr);
 

From 97685ff10e0f866d809fc1e8f115fb6e92ce717c Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Tue, 29 Dec 2015 16:15:45 +0100
Subject: [PATCH 004/241] i965/gen8: Always use BRW_REGISTER_TYPE_UW for MUL on
 GEN8+

The imulExtended tests of the shader bitfield tests of the
OpenGL ES 3.1 CTS fail on gen8+ when BRW_REGISTER_TYPE_W
is used for SHADER_OPCODE_MULH.

Also, remove the now-unused helper function:
static inline bool type_is_signed(unsigned type)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92595
Signed-off-by: Marta Lofstedt <marta.lofstedt@linux.intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp |  3 +--
 src/mesa/drivers/dri/i965/brw_reg.h  | 27 ---------------------------
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 286ee0ed4e7..d4205ba66ad 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3439,8 +3439,7 @@ fs_visitor::lower_integer_multiplication()
              */
             assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
                    mul->src[1].type == BRW_REGISTER_TYPE_UD);
-            mul->src[1].type = (type_is_signed(mul->src[1].type) ?
-                                BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+            mul->src[1].type = BRW_REGISTER_TYPE_UW;
             mul->src[1].stride *= 2;
 
          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 9f2ff9ae5ad..a2a4a40f373 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -287,33 +287,6 @@ type_sz(unsigned type)
    }
 }
 
-static inline bool
-type_is_signed(unsigned type)
-{
-   switch(type) {
-   case BRW_REGISTER_TYPE_D:
-   case BRW_REGISTER_TYPE_W:
-   case BRW_REGISTER_TYPE_F:
-   case BRW_REGISTER_TYPE_B:
-   case BRW_REGISTER_TYPE_V:
-   case BRW_REGISTER_TYPE_VF:
-   case BRW_REGISTER_TYPE_DF:
-   case BRW_REGISTER_TYPE_HF:
-   case BRW_REGISTER_TYPE_Q:
-      return true;
-
-   case BRW_REGISTER_TYPE_UD:
-   case BRW_REGISTER_TYPE_UW:
-   case BRW_REGISTER_TYPE_UB:
-   case BRW_REGISTER_TYPE_UV:
-   case BRW_REGISTER_TYPE_UQ:
-      return false;
-
-   default:
-      unreachable("not reached");
-   }
-}
-
 /**
  * Construct a brw_reg.
  * \param file      one of the BRW_x_REGISTER_FILE values

From d50e6128b815595f7918d6818e8a9cd20d53efd1 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 6 Dec 2015 23:49:48 -0500
Subject: [PATCH 005/241] nv50/ir: attempt to do more constant folding on mad
 -> add conversion

The add might actually have a 0 as an argument, which would convert it
into a mov. Make sure to detect that. Also avoid the hack of putting the
immediate directly into the instruction; instead, use a mov to put it
into place and let the later LoadPropagation pass place it if possible.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_peephole.cpp      | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index c2842c2186f..f5c590eef10 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -676,23 +676,22 @@ ConstantFolding::expr(Instruction *i,
    switch (i->op) {
    case OP_MAD:
    case OP_FMA: {
-      i->op = OP_ADD;
+      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
 
-      /* Move the immediate to the second arg, otherwise the ADD operation
-       * won't be emittable
-       */
-      i->setSrc(1, i->getSrc(0));
+      // Move the immediate into position 1, where we know it might be
+      // emittable. However it might not be anyways, as there may be other
+      // restrictions, so move it into a separate LValue.
+      bld.setPosition(i, false);
+      i->op = OP_ADD;
+      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
       i->setSrc(0, i->getSrc(2));
       i->src(0).mod = i->src(2).mod;
       i->setSrc(2, NULL);
 
-      ImmediateValue src0;
       if (i->src(0).getImmediate(src0))
-         expr(i, src0, *i->getSrc(1)->asImm());
-      if (i->saturate && !prog->getTarget()->isSatSupported(i)) {
-         bld.setPosition(i, false);
-         i->setSrc(1, bld.loadImm(NULL, res.data.u32));
-      }
+         expr(i, src0, src1);
+      else
+         opnd(i, src1, 1);
       break;
    }
    case OP_PFETCH:

From bb52ea45cc731d4580d8a6c90f604023dc04ddcc Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 29 Dec 2015 16:37:19 -0500
Subject: [PATCH 006/241] gallium: add baseinstance/drawid semantics

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_strings.c  |  2 ++
 src/gallium/docs/source/tgsi.rst           | 13 +++++++++++++
 src/gallium/include/pipe/p_shader_tokens.h |  4 +++-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index fc29a2398aa..fd926b37c47 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -96,6 +96,8 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
    "TESSINNER",
    "VERTICESIN",
    "HELPER_INVOCATION",
+   "BASEINSTANCE",
+   "DRAWID",
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index e7b0c2f6377..955ece89da5 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2949,6 +2949,19 @@ invocation is covered or not. Helper invocations are created in order
 to properly compute derivatives, however it may be desirable to skip
 some of the logic in those cases. See ``gl_HelperInvocation`` documentation.
 
+TGSI_SEMANTIC_BASEINSTANCE
+""""""""""""""""""""""""""
+
+For vertex shaders, the base instance argument supplied for this
+draw. This is an integer value, and only the X component is used.
+
+TGSI_SEMANTIC_DRAWID
+""""""""""""""""""""
+
+For vertex shaders, the zero-based index of the current draw in a
+``glMultiDraw*`` invocation. This is an integer value, and only the X
+component is used.
+
 
 Declaration Interpolate
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index a3137aec8db..e8f4ad210e1 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -186,7 +186,9 @@ struct tgsi_declaration_interp
 #define TGSI_SEMANTIC_TESSINNER  33 /**< inner tessellation levels */
 #define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */
 #define TGSI_SEMANTIC_HELPER_INVOCATION 35 /**< current invocation is helper */
-#define TGSI_SEMANTIC_COUNT      36 /**< number of semantic values */
+#define TGSI_SEMANTIC_BASEINSTANCE 36
+#define TGSI_SEMANTIC_DRAWID     37
+#define TGSI_SEMANTIC_COUNT      38 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {

From 87b4e4e29fb2013ce2f0770f39113069bd16057c Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 29 Dec 2015 16:49:32 -0500
Subject: [PATCH 007/241] gallium: add PIPE_CAP_DRAW_PARAMETERS

This allows the state tracker to know that the various draw parameters
are available in vertex shaders.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/docs/source/screen.rst               | 6 ++++--
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c           | 1 +
 src/gallium/drivers/ilo/ilo_screen.c             | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c         | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 1 +
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 1 +
 src/gallium/drivers/softpipe/sp_screen.c         | 1 +
 src/gallium/drivers/svga/svga_screen.c           | 1 +
 src/gallium/drivers/vc4/vc4_screen.c             | 1 +
 src/gallium/drivers/virgl/virgl_screen.c         | 1 +
 src/gallium/include/pipe/p_defines.h             | 1 +
 16 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index e900283f731..41bd0f81e0e 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -239,8 +239,7 @@ The integer capabilities:
   will need to lower TGSI_SEMANTIC_VERTEXID to TGSI_SEMANTIC_VERTEXID_NOBASE
   and TGSI_SEMANTIC_BASEVERTEX, so drivers setting this must handle both these
   semantics. Only relevant if geometry shaders are supported.
-  (Currently not possible to query availability of these two semantics outside
-  this, at least BASEVERTEX should be exposed separately too).
+  (BASEVERTEX could be exposed separately too via ``PIPE_CAP_DRAW_PARAMETERS``).
 * ``PIPE_CAP_POLYGON_OFFSET_CLAMP``: If true, the driver implements support
   for ``pipe_rasterizer_state::offset_clamp``.
 * ``PIPE_CAP_MULTISAMPLE_Z_RESOLVE``: Whether the driver supports blitting
@@ -283,6 +282,9 @@ The integer capabilities:
   a compressed block is copied to/from a plain pixel of the same size.
 * ``PIPE_CAP_CLEAR_TEXTURE``: Whether `clear_texture` will be
   available in contexts.
+* ``PIPE_CAP_DRAW_PARAMETERS``: Whether ``TGSI_SEMANTIC_BASEVERTEX``,
+  ``TGSI_SEMANTIC_BASEINSTANCE``, and ``TGSI_SEMANTIC_DRAWID`` are
+  supported in vertex shaders.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 5bbe4016a2a..4b6d6af0837 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -238,6 +238,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index a5b161882cd..14bd8d797d2 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -254,6 +254,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index cfa2fb41152..ac29b5605bd 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -476,6 +476,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 899f28da7d3..535296317ab 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -301,6 +301,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 854f70cf34c..02303bb79f2 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -174,6 +174,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 272e1d45bff..b3f2492fe64 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -217,6 +217,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 39954464b9c..500510a0870 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -206,6 +206,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 606e25f915b..e3f2505e139 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -200,6 +200,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SHAREABLE_SHADERS:
         case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
         case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 17006f70601..8208686dcb0 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -348,6 +348,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ac13407e2a1..fda5a1eed0b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -340,6 +340,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 9939720e259..7a35a2a81fb 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -251,6 +251,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index fca501bc47d..fe8e75e1f66 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -384,6 +384,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
    }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 8ddf0865d21..e845dfdedfe 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -189,6 +189,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 	case PIPE_CAP_CLEAR_TEXTURE:
+	case PIPE_CAP_DRAW_PARAMETERS:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 26a4f7736e3..668f2e01cdd 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -219,6 +219,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index cbf0ba617be..d4933e74884 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -635,6 +635,7 @@ enum pipe_cap
    PIPE_CAP_SHAREABLE_SHADERS,
    PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS,
    PIPE_CAP_CLEAR_TEXTURE,
+   PIPE_CAP_DRAW_PARAMETERS,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From daaf0bdf46bc5084bdba1ee5d42d994f30a65841 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 29 Dec 2015 17:00:05 -0500
Subject: [PATCH 008/241] gallium: add a drawid to pipe_draw_info

This will allow the state tracker to inform the driver where in a
broken-up multidraw we currently are. This can then be passed into the
vertex shader.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/include/pipe/p_state.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 6bdf03a8b2b..84633633f55 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -587,6 +587,8 @@ struct pipe_draw_info
    unsigned start_instance; /**< first instance id */
    unsigned instance_count; /**< number of instances */
 
+   unsigned drawid; /**< id of this draw in a multidraw */
+
    unsigned vertices_per_patch; /**< the number of vertices per patch */
 
    /**

From 89bda9772d5b6f736d5f18e90a1ee4056438fe42 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 29 Dec 2015 16:39:16 -0500
Subject: [PATCH 009/241] st/mesa: add GL_ARB_shader_draw_parameters support

Hooks up the new system values, passes the drawid in.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/state_tracker/st_draw.c           | 1 +
 src/mesa/state_tracker/st_extensions.c     | 1 +
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 4 ++--
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index f4b273bf93f..635a0126834 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -266,6 +266,7 @@ st_draw_vbo(struct gl_context *ctx,
       info.instance_count = prims[i].num_instances;
       info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
       info.index_bias = prims[i].basevertex;
+      info.drawid = prims[i].draw_id;
       if (!ib) {
          info.min_index = info.start;
          info.max_index = info.start + info.count - 1;
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index a2418e28a91..a0168d8e408 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -454,6 +454,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_pipeline_statistics_query),    PIPE_CAP_QUERY_PIPELINE_STATISTICS        },
       { o(ARB_point_sprite),                 PIPE_CAP_POINT_SPRITE                     },
       { o(ARB_seamless_cube_map),            PIPE_CAP_SEAMLESS_CUBE_MAP                },
+      { o(ARB_shader_draw_parameters),       PIPE_CAP_DRAW_PARAMETERS                  },
       { o(ARB_shader_stencil_export),        PIPE_CAP_SHADER_STENCIL_EXPORT            },
       { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS                        },
       { o(ARB_shader_texture_lod),           PIPE_CAP_SM3                              },
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 5a6be08185f..160838c498e 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -4328,8 +4328,8 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
    TGSI_SEMANTIC_INSTANCEID,
    TGSI_SEMANTIC_VERTEXID_NOBASE,
    TGSI_SEMANTIC_BASEVERTEX,
-   0, /* SYSTEM_VALUE_BASE_INSTANCE */
-   0, /* SYSTEM_VALUE_DRAW_ID */
+   TGSI_SEMANTIC_BASEINSTANCE,
+   TGSI_SEMANTIC_DRAWID,
 
    /* Geometry shader
     */

From 517a93b346e720082e22e358b63b5dbc5c42aa09 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 30 Dec 2015 14:50:02 -0500
Subject: [PATCH 010/241] nvc0: add ARB_shader_draw_parameters support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 docs/relnotes/11.2.0.html                     |  1 +
 src/gallium/drivers/nouveau/codegen/nv50_ir.h |  3 +++
 .../drivers/nouveau/codegen/nv50_ir_driver.h  |  4 +++-
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 10 +++++++++-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 11 ++++++++++
 .../nouveau/codegen/nv50_ir_target_nvc0.cpp   |  3 +++
 .../drivers/nouveau/nouveau_compiler.c        |  2 +-
 .../drivers/nouveau/nv50/nv50_program.c       |  2 +-
 .../drivers/nouveau/nvc0/mme/com9097.mme      |  8 ++++++--
 .../drivers/nouveau/nvc0/mme/com9097.mme.h    | 18 ++++++++++-------
 .../drivers/nouveau/nvc0/nvc0_program.c       |  4 +++-
 .../drivers/nouveau/nvc0/nvc0_program.h       |  1 +
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   | 20 +++++++++++++++++++
 14 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/docs/relnotes/11.2.0.html b/docs/relnotes/11.2.0.html
index 23bb31c6235..d31da8ba4a6 100644
--- a/docs/relnotes/11.2.0.html
+++ b/docs/relnotes/11.2.0.html
@@ -47,6 +47,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_base_instance on freedreno/a4xx</li>
 <li>GL_ARB_compute_shader on i965</li>
 <li>GL_ARB_copy_image on r600</li>
+<li>GL_ARB_shader_draw_parameters on i965, nvc0</li>
 <li>GL_ARB_tessellation_shader on i965 and r600 (evergreen/cayman only)</li>
 <li>GL_ARB_texture_buffer_object_rgb32 on freedreno/a4xx</li>
 <li>GL_ARB_texture_buffer_range on freedreno/a4xx</li>
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index d09a0ab0610..d1fdd75495f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -390,6 +390,9 @@ enum SVSemantic
    SV_VERTEX_STRIDE,
    SV_INVOCATION_INFO,
    SV_THREAD_KILL,
+   SV_BASEVERTEX,
+   SV_BASEINSTANCE,
+   SV_DRAWID,
    SV_UNDEFINED,
    SV_LAST
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index b49bf9d53bc..4504240ac5e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -124,6 +124,7 @@ struct nv50_ir_prog_info
    union {
       struct {
          uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */
+         bool usesDrawParameters;
       } vp;
       struct {
          uint8_t inputPatchSize;
@@ -160,8 +161,9 @@ struct nv50_ir_prog_info
       uint8_t clipDistances;     /* number of clip distance outputs */
       uint8_t cullDistances;     /* number of cull distance outputs */
       int8_t genUserClip;        /* request user clip planes for ClipVertex */
+      uint8_t auxCBSlot;         /* constant buffer index of UCP/draw data */
       uint16_t ucpBase;          /* base address for UCPs */
-      uint8_t ucpCBSlot;         /* constant buffer index of UCP data */
+      uint16_t drawInfoBase;     /* base address for draw parameters */
       uint8_t pointSize;         /* output index for PointSize */
       uint8_t instanceId;        /* system value index of InstanceID */
       uint8_t vertexId;          /* system value index of VertexID */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index b23386040a7..beb67fe20f1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -377,6 +377,9 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
    case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
    case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL;
+   case TGSI_SEMANTIC_BASEVERTEX: return nv50_ir::SV_BASEVERTEX;
+   case TGSI_SEMANTIC_BASEINSTANCE: return nv50_ir::SV_BASEINSTANCE;
+   case TGSI_SEMANTIC_DRAWID:     return nv50_ir::SV_DRAWID;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
@@ -1128,6 +1131,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       case TGSI_SEMANTIC_SAMPLEPOS:
          info->prop.fp.sampleInterp = 1;
          break;
+      case TGSI_SEMANTIC_BASEVERTEX:
+      case TGSI_SEMANTIC_BASEINSTANCE:
+      case TGSI_SEMANTIC_DRAWID:
+         info->prop.vp.usesDrawParameters = true;
+         break;
       default:
          break;
       }
@@ -3252,7 +3260,7 @@ Converter::handleUserClipPlanes()
 
    for (c = 0; c < 4; ++c) {
       for (i = 0; i < info->io.genUserClip; ++i) {
-         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot,
+         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot,
                                 TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
          Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
          if (c == 0)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index e67bf3eca84..6530078b938 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1576,6 +1576,17 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
       break;
+   case SV_BASEVERTEX:
+   case SV_BASEINSTANCE:
+   case SV_DRAWID:
+      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
+                      bld.mkSymbol(FILE_MEMORY_CONST,
+                                   prog->driver->io.auxCBSlot,
+                                   TYPE_U32,
+                                   prog->driver->io.drawInfoBase +
+                                   4 * (sv - SV_BASEVERTEX)),
+                      NULL);
+      break;
    default:
       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 19637ce33f5..014c652eede 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -295,6 +295,9 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
    case SV_SAMPLE_INDEX:   return 0;
    case SV_SAMPLE_POS:     return 0;
    case SV_SAMPLE_MASK:    return 0;
+   case SV_BASEVERTEX:     return 0;
+   case SV_BASEINSTANCE:   return 0;
+   case SV_DRAWID:         return 0;
    default:
       return 0xffffffff;
    }
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index 670b0c8b135..cd44aa1e1d9 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -112,7 +112,7 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],
    info.bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info.bin.source = tokens;
 
-   info.io.ucpCBSlot = 15;
+   info.io.auxCBSlot = 15;
    info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
 
    info.io.resInfoCBSlot = 15;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index a4b8ddfda95..b63584e0a09 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -335,7 +335,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info->bin.source = (void *)prog->pipe.tokens;
 
-   info->io.ucpCBSlot = 15;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index b2060d1fa53..8c647d0c66c 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -241,8 +241,10 @@ locn_0f_ts:
    parm $r2 /* instance_count */
    parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
    parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x8e4 /* CB_DATA */
    braz $r2 #dei_end
-   parm $r5 /* start_instance */
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
    read $r6 0x50d /* VB_ELEMENT_BASE */
    read $r7 0x50e /* VB_INSTANCE_BASE */
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
@@ -283,8 +285,10 @@ dei_end:
    parm $r2 /* count */
    parm $r3 /* instance_count */
    parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
-   parm $r4 send $r4 /* start_instance */
    braz $r3 #dai_end
+   parm $r4 send $r4 /* start_instance */
+   maddrsend 0x8e4 /* CB_DATA, send 0 as base_vertex */
+   send $r4 /* send start_instance */
    read $r6 0x50e /* VB_INSTANCE_BASE */
    maddr 0x50e /* VB_INSTANCE_BASE */
    mov $r5 0x1
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index bac9042c2df..acad303ce60 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -128,11 +128,13 @@ uint32_t mme9097_draw_elts_indirect[] = {
 	0x00000301,
 	0x00000201,
 	0x017dc451,
-/* 0x000e: dei_again */
 	0x00002431,
-	0x0005d007,
-	0x00000501,
-/* 0x001b: dei_end */
+/* 0x0010: dei_again */
+	0x02390021,
+	0x00061007,
+	0x00002531,
+/* 0x001d: dei_end */
+	0x00002841,
 	0x01434615,
 	0x01438715,
 	0x05434021,
@@ -161,11 +163,13 @@ uint32_t mme9097_draw_elts_indirect[] = {
 uint32_t mme9097_draw_arrays_indirect[] = {
 	0x00000201,
 	0x00000301,
-/* 0x0009: dai_again */
+/* 0x000b: dai_again */
 	0x00d74451,
+	0x00049807,
 	0x00002431,
-/* 0x0013: dai_end */
-	0x0003d807,
+/* 0x0015: dai_end */
+	0x02390071,
+	0x00002041,
 	0x01438615,
 	0x01438021,
 	0x00004511,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 67a25acf778..730d6feac69 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -533,8 +533,9 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    info->bin.source = (void *)prog->pipe.tokens;
 
    info->io.genUserClip = prog->vp.num_ucps;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = 256;
-   info->io.ucpCBSlot = 15;
+   info->io.drawInfoBase = 256 + 128;
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
@@ -583,6 +584,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    prog->num_barriers = info->numBarriers;
 
    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+   prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters;
 
    if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
       info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index 9c45e7b3e31..8b8d221edfc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -42,6 +42,7 @@ struct nvc0_program {
       uint8_t num_ucps; /* also set to max if ClipDistance is used */
       uint8_t edgeflag; /* attribute index of edgeflag input */
       bool need_vertex_id;
+      bool need_draw_parameters;
    } vp;
    struct {
       uint8_t early_z;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 500510a0870..f029d164436 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -184,6 +184,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -206,7 +207,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_DRAW_PARAMETERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 54443bdccc0..273451e638d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -814,6 +814,14 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
       IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0);
 
+   /* Queue things up to let the macros write params to the driver constbuf */
+   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+   PUSH_DATA (push, 512);
+   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+   BEGIN_NVC0(push, NVC0_3D(CB_POS), 1);
+   PUSH_DATA (push, 256 + 128);
+
    PUSH_SPACE(push, 8);
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
@@ -901,6 +909,18 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    /* 8 as minimum to avoid immediate double validation of new buffers */
    nvc0_state_validate(nvc0, ~0, 8);
 
+   if (nvc0->vertprog->vp.need_draw_parameters) {
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+      PUSH_DATA (push, 512);
+      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
+      PUSH_DATA (push, 256 + 128);
+      PUSH_DATA (push, info->index_bias);
+      PUSH_DATA (push, info->start_instance);
+      PUSH_DATA (push, info->drawid);
+   }
+
    push->kick_notify = nvc0_draw_vbo_kick_notify;
 
    /* TODO: Instead of iterating over all the buffer resources looking for

From 5ac15f788be10999e06cfc0261fd61ab67e3da9c Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 30 Dec 2015 18:47:18 -0500
Subject: [PATCH 011/241] glsl: add GL_ARB_shader_draw_parameters define

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
---
 src/glsl/glcpp/glcpp-parse.y | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 2fd4cf04079..ef1a6575aaa 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2506,6 +2506,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 
               if (extensions->ARB_shader_subroutine)
                  add_builtin_define(parser, "GL_ARB_shader_subroutine", 1);
+
+              if (extensions->ARB_shader_draw_parameters)
+                 add_builtin_define(parser, "GL_ARB_shader_draw_parameters", 1);
 	   }
 	}
 

From 4acf71c89b5ef5e2fe8c1a3d7ecf6031e191463c Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 28 Dec 2015 14:20:28 -0800
Subject: [PATCH 012/241] drirc: Disable ARB_blend_func_extended for Heaven
 4.0/Valley 1.0.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unigine Heaven 4.0 and Valley 1.0 use dual color blending but don't
specify which fragment shader output is which, so there's at best a
50/50 chance of us guessing it correctly.  This is invalid.

Unigine fixed this in 4.1 and 1.1 versions over a year and a half ago,
but hasn't actually released them for whatever reason.  So, add the
workaround back so that it works for most people.

Fixes Heaven 4.0/Valley 1.0 rendering on Ivybridge.  For whatever
reason, Broadwell worked.  4.1 and 1.1 have always worked.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92233
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/mesa/drivers/dri/common/drirc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc
index bb840eaba94..e1874c3f1dc 100644
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -37,18 +37,26 @@ TODO: document the other workarounds.
 
         <application name="Unigine Heaven (32-bit)" executable="heaven_x86">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 4.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine Heaven (64-bit)" executable="heaven_x64">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 4.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine Valley (32-bit)" executable="valley_x86">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 1.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine Valley (64-bit)" executable="valley_x64">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 1.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
 	</application>
 
         <application name="Unigine OilRush (32-bit)" executable="OilRush_x86">

From 7cdc2b9ca0ab60b282416b975a2ac6d7abcd42ad Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 30 Dec 2015 02:33:00 -0800
Subject: [PATCH 013/241] glsl: Fix varying struct locations when varying
 packing is disabled.

varying_matches::record tries to compute the number of components in
each varying, which varying_matches::assign_locations uses to assign
locations.  With varying packing, it uses glsl_type::component_slots()
to come up with a reasonable value.

Without varying packing, it fell back to an open-coded computation
that didn't bother to handle structs at all.  I believe we can simply
use 4 * glsl_type::count_attribute_slots(false), which already handles
these cases correctly.

Partially fixes rendering in GFXBench 4.0's tessellation benchmark.
(NVE0 is almost right after this, but i965 is still mostly garbage.)

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
---
 src/glsl/link_varyings.cpp | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 9cc77feb78a..1ff25b85253 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -959,19 +959,8 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
          type = type->fields.array;
       }
 
-      if (type->is_array()) {
-         slots = 1;
-         while (type->is_array()) {
-            slots *= type->length;
-            type = type->fields.array;
-         }
-         slots *= type->matrix_columns;
-      } else {
-         slots = type->matrix_columns;
-      }
-      if (type->without_array()->is_dual_slot_double())
-         slots *= 2;
-      this->matches[this->num_matches].num_components = 4 * slots;
+      slots = type->count_attribute_slots(false);
+      this->matches[this->num_matches].num_components = slots * 4;
    } else {
       this->matches[this->num_matches].num_components
          = var->type->component_slots();

From 65d3f85eb3efb326a826c2db0225340d5421a389 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 30 Dec 2015 02:53:08 -0800
Subject: [PATCH 014/241] nvc0: Set winding order regardless of domain.

Quads need to respect winding order, too - not just triangles.

Fixes rendering in GFXBench 4.0's tessellation benchmark.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 730d6feac69..7cb86e3b905 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -285,8 +285,6 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       break;
    case PIPE_PRIM_TRIANGLES:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
-      if (info->prop.tp.winding > 0)
-         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
       break;
    case PIPE_PRIM_QUADS:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
@@ -295,6 +293,10 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       tp->tp.tess_mode = ~0;
       return;
    }
+
+   if (info->prop.tp.winding > 0)
+      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
+
    if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS)
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
 

From fc890d703ee079b1eb37c316f8ba8554b3184248 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:11 +1000
Subject: [PATCH 015/241] st/glsl_to_tgsi: store if dst is double in array

This is just a precursor patch to a fix for doubles with
tessellation that I've written.

We need to descend into output arrays in that case and
mark dst's as double.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 160838c498e..163f6ea8066 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -555,6 +555,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 {
    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
    int num_reladdr = 0, i, j;
+   bool dst_is_double[2];
 
    op = get_opcode(ir, op, dst, src0, src1);
 
@@ -658,7 +659,13 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
     * GLSL [0].z -> TGSI [1].xy
     * GLSL [0].w -> TGSI [1].zw
     */
-   if (inst->dst[0].type == GLSL_TYPE_DOUBLE || inst->dst[1].type == GLSL_TYPE_DOUBLE ||
+   for (j = 0; j < 2; j++) {
+      dst_is_double[j] = false;
+      if (inst->dst[j].type == GLSL_TYPE_DOUBLE)
+         dst_is_double[j] = true;
+   }
+
+   if (dst_is_double[0] || dst_is_double[1] ||
        inst->src[0].type == GLSL_TYPE_DOUBLE) {
       glsl_to_tgsi_instruction *dinst = NULL;
       int initial_src_swz[4], initial_src_idx[4];
@@ -699,7 +706,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 
          /* modify the destination if we are splitting */
          for (j = 0; j < 2; j++) {
-            if (dinst->dst[j].type == GLSL_TYPE_DOUBLE) {
+            if (dst_is_double[j]) {
                dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
                dinst->dst[j].index = initial_dst_idx[j];
                if (i > 1)
@@ -732,7 +739,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
                   - F2D is a float src0, DLDEXP is integer src1 */
                if (op == TGSI_OPCODE_F2D ||
                    op == TGSI_OPCODE_DLDEXP ||
-                   (op == TGSI_OPCODE_UCMP && dinst->dst[0].type == GLSL_TYPE_DOUBLE)) {
+                   (op == TGSI_OPCODE_UCMP && dst_is_double[0])) {
                   dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
                }
             }

From aab0c6c9c400bc7f83516b29cf736fa7ce1f2a13 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:12 +1000
Subject: [PATCH 016/241] st/glsl_to_tgsi: handle double outputs in arrays.

This handles the case where a double output is stored
in an array, and tracks it for use in the double
instruction emit code.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 35 +++++++++++++++++++---
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 163f6ea8066..d152bf93485 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -334,8 +334,24 @@ struct array_decl {
    unsigned mesa_index;
    unsigned array_id;
    unsigned array_size;
+   unsigned array_type;
 };
 
+static unsigned
+find_array_type(struct array_decl *arrays, unsigned count, unsigned array_id)
+{
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      struct array_decl *decl = &arrays[i];
+
+      if (array_id == decl->array_id) {
+         return decl->array_type;
+      }
+   }
+   return GLSL_TYPE_ERROR;
+}
+
 struct rename_reg_pair {
    int old_reg;
    int new_reg;
@@ -663,6 +679,11 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
       dst_is_double[j] = false;
       if (inst->dst[j].type == GLSL_TYPE_DOUBLE)
          dst_is_double[j] = true;
+      else if (inst->dst[j].file == PROGRAM_OUTPUT && inst->dst[j].type == GLSL_TYPE_ARRAY) {
+         unsigned type = find_array_type(this->output_arrays, this->num_output_arrays, inst->dst[j].array_id);
+         if (type == GLSL_TYPE_DOUBLE)
+            dst_is_double[j] = true;
+      }
    }
 
    if (dst_is_double[0] || dst_is_double[1] ||
@@ -2270,10 +2291,13 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
             decl->mesa_index = var->data.location;
             decl->array_id = num_input_arrays + 1;
-            if (is_2d)
+            if (is_2d) {
                decl->array_size = type_size(var->type->fields.array);
-            else
+               decl->array_type = var->type->fields.array->without_array()->base_type;
+            } else {
                decl->array_size = type_size(var->type);
+               decl->array_type = var->type->without_array()->base_type;
+            }
             num_input_arrays++;
 
             entry = new(mem_ctx) variable_storage(var,
@@ -2296,10 +2320,13 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
             decl->mesa_index = var->data.location;
             decl->array_id = num_output_arrays + 1;
-            if (is_2d)
+            if (is_2d) {
                decl->array_size = type_size(var->type->fields.array);
-            else
+               decl->array_type = var->type->fields.array->without_array()->base_type;
+            } else {
                decl->array_size = type_size(var->type);
+               decl->array_type = var->type->without_array()->base_type;
+            }
             num_output_arrays++;
 
             entry = new(mem_ctx) variable_storage(var,

From 14506dcae20d89ae9380c7a4f1843586c59db16d Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:13 +1000
Subject: [PATCH 017/241] st/glsl_to_tgsi: handle doubles in array shrinking
 code.

This takes double inputs into account in the array
shrinking code. This fixes some issues with doubles
and geom/tess inputs.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index d152bf93485..6eb31b330b5 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2372,6 +2372,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 static void
 shrink_array_declarations(struct array_decl *arrays, unsigned count,
                           GLbitfield64 usage_mask,
+                          GLbitfield64 double_usage_mask,
                           GLbitfield patch_usage_mask)
 {
    unsigned i, j;
@@ -2392,6 +2393,8 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
          else {
             if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
                break;
+            if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
+               break;
          }
 
          decl->mesa_index++;
@@ -2409,6 +2412,8 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
          else {
             if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
                break;
+            if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
+               break;
          }
 
          decl->array_size--;
@@ -5587,9 +5592,9 @@ get_mesa_program(struct gl_context *ctx,
 
    do_set_program_inouts(shader->ir, prog, shader->Stage);
    shrink_array_declarations(v->input_arrays, v->num_input_arrays,
-                             prog->InputsRead, prog->PatchInputsRead);
+                             prog->InputsRead, prog->DoubleInputsRead, prog->PatchInputsRead);
    shrink_array_declarations(v->output_arrays, v->num_output_arrays,
-                             prog->OutputsWritten, prog->PatchOutputsWritten);
+                             prog->OutputsWritten, 0ULL, prog->PatchOutputsWritten);
    count_resources(v, prog);
 
    /* This must be done before the uniform storage is associated. */

From 7351c7684f75704dfef82a25177e6c5284d8bb0a Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:14 +1000
Subject: [PATCH 018/241] st/glsl_to_tgsi: setup writemask for double arrays
 and matrices.

It's important for the double instruction emission code that
the writemasks are correct going in for doubles, so it knows
which channels to replicate.

This fixes it for the array and matrix cases.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 6eb31b330b5..133ba3714a8 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2754,7 +2754,26 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
     */
    if (ir->write_mask == 0) {
       assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
-      l.writemask = WRITEMASK_XYZW;
+
+      if (ir->lhs->type->is_array() || ir->lhs->type->without_array()->is_matrix()) {
+         if (ir->lhs->type->without_array()->is_double()) {
+            switch (ir->lhs->type->without_array()->vector_elements) {
+            case 1:
+               l.writemask = WRITEMASK_X;
+               break;
+            case 2:
+               l.writemask = WRITEMASK_XY;
+               break;
+            case 3:
+               l.writemask = WRITEMASK_XYZ;
+               break;
+            case 4:
+               l.writemask = WRITEMASK_XYZW;
+               break;
+            }
+         } else
+            l.writemask = WRITEMASK_XYZW;
+      }
    } else if (ir->lhs->type->is_scalar() &&
               !ir->lhs->type->is_double() &&
               ir->lhs->variable_referenced()->data.mode == ir_var_shader_out) {

From d87894b98f9b60cd7298229c0a1da72116f98047 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:15 +1000
Subject: [PATCH 019/241] st/glsl_to_tgsi: handle double immediates in matrices
 properly.

This handles matrix initialisation properly.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 59 ++++++++++++++++++----
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 133ba3714a8..4d87a3266b1 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2907,20 +2907,57 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
       st_dst_reg mat_column = st_dst_reg(mat);
 
       for (i = 0; i < ir->type->matrix_columns; i++) {
-         assert(ir->type->base_type == GLSL_TYPE_FLOAT);
-         values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
-
-         src = st_src_reg(file, -1, ir->type->base_type);
-         src.index = add_constant(file,
-                                  values,
-                                  ir->type->vector_elements,
-                                  GL_FLOAT,
-                                  &src.swizzle);
-         emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+         switch (ir->type->base_type) {
+         case GLSL_TYPE_FLOAT:
+            values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
 
+            src = st_src_reg(file, -1, ir->type->base_type);
+            src.index = add_constant(file,
+                                     values,
+                                     ir->type->vector_elements,
+                                     GL_FLOAT,
+                                     &src.swizzle);
+            emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            break;
+         case GLSL_TYPE_DOUBLE:
+            values = (gl_constant_value *) &ir->value.d[i * ir->type->vector_elements];
+            src = st_src_reg(file, -1, ir->type->base_type);
+            src.index = add_constant(file,
+                                     values,
+                                     ir->type->vector_elements,
+                                     GL_DOUBLE,
+                                     &src.swizzle);
+            if (ir->type->vector_elements >= 2) {
+               mat_column.writemask = WRITEMASK_XY;
+               src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
+               emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            } else {
+               mat_column.writemask = WRITEMASK_X;
+               src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+               emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            }
+            src.index++;
+            if (ir->type->vector_elements > 2) {
+               if (ir->type->vector_elements == 4) {
+                  mat_column.writemask = WRITEMASK_ZW;
+                  src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
+                  emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+               } else {
+                  mat_column.writemask = WRITEMASK_Z;
+                  src.swizzle = MAKE_SWIZZLE4(SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y);
+                  emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+                  mat_column.writemask = WRITEMASK_XYZW;
+                  src.swizzle = SWIZZLE_XYZW;
+               }
+               mat_column.index++;
+            }
+            break;
+         default:
+            unreachable("Illegal matrix constant type.\n");
+            break;
+         }
          mat_column.index++;
       }
-
       this->result = mat;
       return;
    }

From 84dbf3c4ff8976459eb0890d8affc8e521689858 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:16 +1000
Subject: [PATCH 020/241] st/glsl_to_tgsi: when doing reladdr get vec4 of
 correct type

This fixes fp64 relative addressing, in the upcoming
dmat-vs-gs-tcs-tes.shader_test.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 4d87a3266b1..f2ae58e9e96 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1441,7 +1441,7 @@ glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
    if (reg->reladdr2) emit_arl(ir, address_reg2, *reg->reladdr2);
 
    if (*num_reladdr != 1) {
-      st_src_reg temp = get_temp(glsl_type::vec4_type);
+      st_src_reg temp = get_temp(reg->type == GLSL_TYPE_DOUBLE ? glsl_type::dvec4_type : glsl_type::vec4_type);
 
       emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
       *reg = temp;

From dc7b33c1f372c835ce91afa0350c0bffe00c344e Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:17 +1000
Subject: [PATCH 021/241] st/glsl_to_tgsi: readd the double_reg2 for input
 index mapping

Otherwise we end up emitting the wrong index for the second
double.

This fixes dmat-vs-gs-tcs-tes.shader_test and dvec3-vs-gs-tcs-tes.shader_test

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index f2ae58e9e96..9f0efb85ed5 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -4640,7 +4640,7 @@ src_register(struct st_translate *t, const st_src_reg *reg)
       if (!reg->array_id) {
          assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
          assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
-         return t->inputs[t->inputMapping[index]];
+         return t->inputs[t->inputMapping[index] + double_reg2];
       }
       else {
          struct array_decl *decl = &t->input_arrays[reg->array_id-1];
@@ -4649,7 +4649,7 @@ src_register(struct st_translate *t, const st_src_reg *reg)
 
          assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
          assert(t->inputs[slot].ArrayID == reg->array_id);
-         return ureg_src_array_offset(t->inputs[slot], index - mesa_index);
+         return ureg_src_array_offset(t->inputs[slot], index + double_reg2 - mesa_index);
       }
 
    case PROGRAM_ADDRESS:

From d214ce86cf0d5f5bd0135f1558194391e72501d0 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:18 +1000
Subject: [PATCH 022/241] st/glsl_to_tgsi: handle different attrib size

Vertex inputs are counted differently in some cases; with
vertex inputs we need to make sure we don't double count them.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 9f0efb85ed5..d7b3e0eeb50 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1085,7 +1085,7 @@ glsl_to_tgsi_visitor::st_src_reg_for_type(int type, int val)
 }
 
 static int
-type_size(const struct glsl_type *type)
+attrib_type_size(const struct glsl_type *type, bool is_vs_input)
 {
    unsigned int i;
    int size;
@@ -1108,7 +1108,7 @@ type_size(const struct glsl_type *type)
       break;
    case GLSL_TYPE_DOUBLE:
       if (type->is_matrix()) {
-         if (type->vector_elements <= 2)
+         if (type->vector_elements <= 2 || is_vs_input)
             return type->matrix_columns;
          else
             return type->matrix_columns * 2;
@@ -1116,7 +1116,7 @@ type_size(const struct glsl_type *type)
          /* For doubles if we have a double or dvec2 they fit in one
           * vec4, else they need 2 vec4s.
           */
-         if (type->vector_elements <= 2)
+         if (type->vector_elements <= 2 || is_vs_input)
             return 1;
          else
             return 2;
@@ -1124,11 +1124,11 @@ type_size(const struct glsl_type *type)
       break;
    case GLSL_TYPE_ARRAY:
       assert(type->length > 0);
-      return type_size(type->fields.array) * type->length;
+      return attrib_type_size(type->fields.array, is_vs_input) * type->length;
    case GLSL_TYPE_STRUCT:
       size = 0;
       for (i = 0; i < type->length; i++) {
-         size += type_size(type->fields.structure[i].type);
+         size += attrib_type_size(type->fields.structure[i].type, is_vs_input);
       }
       return size;
    case GLSL_TYPE_SAMPLER:
@@ -1148,6 +1148,11 @@ type_size(const struct glsl_type *type)
    return 0;
 }
 
+static int
+type_size(const struct glsl_type *type)
+{
+  return attrib_type_size(type, false);
+}
 
 /**
  * If the given GLSL type is an array or matrix or a structure containing
@@ -2454,6 +2459,10 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
       element_size = 1;
 
    if (index) {
+
+      if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
+	  src.file == PROGRAM_INPUT)
+	 element_size = attrib_type_size(ir->type, true);
       if (is_2D) {
          src.index2D = index->value.i[0];
          src.has_index2 = true;

From b83525599290ab1226f64163cf13761223f17829 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 19 Dec 2015 14:43:19 +1000
Subject: [PATCH 023/241] st/glsl_to_tgsi: fix block movs for doubles

While playing with fp64, I disabled varying packing to debug
something else, and noticed we never emitted half the output
movs for double matrix arrays.

We should be moving the left index two slots for dual
source doubles, and the right index two slots for non-vs
input doubles.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index d7b3e0eeb50..ad3a6846558 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -85,6 +85,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg(gl_register_file file, int index, int type)
@@ -100,6 +101,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg(gl_register_file file, int index, int type, int index2D)
@@ -115,6 +117,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg()
@@ -130,6 +133,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    explicit st_src_reg(st_dst_reg reg);
@@ -150,6 +154,7 @@ public:
     */
    bool double_reg2;
    unsigned array_id;
+   bool is_double_vertex_input;
 };
 
 class st_dst_reg {
@@ -224,6 +229,7 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->has_index2 = reg.has_index2;
    this->double_reg2 = false;
    this->array_id = reg.array_id;
+   this->is_double_vertex_input = false;
 }
 
 st_dst_reg::st_dst_reg(st_src_reg reg)
@@ -2370,6 +2376,8 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
    this->result = st_src_reg(entry->file, entry->index, var->type);
    this->result.array_id = entry->array_id;
+   if (this->shader->Stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in && var->type->is_double())
+      this->result.is_double_vertex_input = true;
    if (!native_integers)
       this->result.type = GLSL_TYPE_FLOAT;
 }
@@ -2714,7 +2722,7 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
    if (type->is_matrix()) {
       const struct glsl_type *vec_type;
 
-      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
+      vec_type = glsl_type::get_instance(type->is_double() ? GLSL_TYPE_DOUBLE : GLSL_TYPE_FLOAT,
                                          type->vector_elements, 1);
 
       for (int i = 0; i < type->matrix_columns; i++) {
@@ -2744,6 +2752,11 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
    }
    l->index++;
    r->index++;
+   if (type->is_dual_slot_double()) {
+      l->index++;
+      if (r->is_double_vertex_input == false)
+	 r->index++;
+   }
 }
 
 void

From 36db91c4c44e8546d80d3ca246853c738086625d Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Tue, 29 Dec 2015 14:24:09 -0800
Subject: [PATCH 024/241] mesa: Add MESA_VERBOSE=api for several indexed
 BindBuffer variants

v2:
 * Add braces '{}' when the _mesa_debug call spans multiple lines (Ken)

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/main/bufferobj.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index e0639c87ef4..181eb49d408 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -1205,9 +1205,10 @@ _mesa_BindBuffer(GLenum target, GLuint buffer)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (MESA_VERBOSE & VERBOSE_API)
+   if (MESA_VERBOSE & VERBOSE_API) {
       _mesa_debug(ctx, "glBindBuffer(%s, %u)\n",
                   _mesa_enum_to_string(target), buffer);
+   }
 
    bind_buffer_object(ctx, target, buffer);
 }
@@ -1562,12 +1563,13 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
 {
    bool valid_usage;
 
-   if (MESA_VERBOSE & VERBOSE_API)
+   if (MESA_VERBOSE & VERBOSE_API) {
       _mesa_debug(ctx, "%s(%s, %ld, %p, %s)\n",
                   func,
                   _mesa_enum_to_string(target),
                   (long int) size, data,
                   _mesa_enum_to_string(usage));
+   }
 
    if (size < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size < 0)", func);
@@ -3911,6 +3913,11 @@ _mesa_BindBufferRange(GLenum target, GLuint index,
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBufferRange(%s, %u, %u, %d, %d)\n",
+                  _mesa_enum_to_string(target), index, buffer, offset, size);
+   }
+
    if (buffer == 0) {
       bufObj = ctx->Shared->NullBufferObj;
    } else {
@@ -3963,6 +3970,11 @@ _mesa_BindBufferBase(GLenum target, GLuint index, GLuint buffer)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBufferBase(%s, %u, %u)\n",
+                  _mesa_enum_to_string(target), index, buffer);
+   }
+
    if (buffer == 0) {
       bufObj = ctx->Shared->NullBufferObj;
    } else {
@@ -4033,6 +4045,12 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBuffersRange(%s, %u, %d, %p, %p, %p)\n",
+                  _mesa_enum_to_string(target), first, count,
+                  buffers, offsets, sizes);
+   }
+
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
       bind_xfb_buffers_range(ctx, first, count, buffers, offsets, sizes);
@@ -4061,6 +4079,11 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBuffersBase(%s, %u, %d, %p)\n",
+                  _mesa_enum_to_string(target), first, count, buffers);
+   }
+
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
       bind_xfb_buffers_base(ctx, first, count, buffers);

From 3dce7bf26874350264a1d567234ae1d785a08127 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Wed, 30 Dec 2015 13:08:07 -0800
Subject: [PATCH 025/241] mesa: Add MESA_VERBOSE=api for
 GL_ARB_program_interface_query

v2:
 * Add braces '{}' when the _mesa_debug call spans multiple lines (Ken)

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/main/program_resource.c | 39 ++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index b7e25fe3840..9a15cfe70b8 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -70,6 +70,13 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
                             GLenum pname, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramInterfaceiv(%u, %s, %s, %p)\n",
+                  program, _mesa_enum_to_string(programInterface),
+                  _mesa_enum_to_string(pname), params);
+   }
+
    unsigned i;
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
@@ -226,6 +233,12 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
                               const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceIndex(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    unsigned array_index = 0;
    struct gl_program_resource *res;
    struct gl_shader_program *shProg =
@@ -290,6 +303,13 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
                              GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceName(%u, %s, %u, %d, %p, %p)\n",
+                  program, _mesa_enum_to_string(programInterface), index,
+                  bufSize, length, name);
+   }
+
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
                                       "glGetProgramResourceName");
@@ -315,6 +335,13 @@ _mesa_GetProgramResourceiv(GLuint program, GLenum programInterface,
                            GLsizei *length, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceiv(%u, %s, %u, %d, %p, %d, %p, %p)\n",
+                  program, _mesa_enum_to_string(programInterface), index,
+                  propCount, props, bufSize, length, params);
+   }
+
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program, "glGetProgramResourceiv");
 
@@ -361,6 +388,12 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
                                  const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceLocation(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocation");
 
@@ -411,6 +444,12 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface,
                                       const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceLocationIndex(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocationIndex");
 

From 816ddee6b8c45648bd24cfd6b4bbf1a2ad6fedb0 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 1 Jan 2016 12:52:22 -0500
Subject: [PATCH 026/241] nir/lower_clip: add missing writemask on store

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/glsl/nir/nir_lower_clip.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/glsl/nir/nir_lower_clip.c b/src/glsl/nir/nir_lower_clip.c
index f84a02410a8..0ca6a289396 100644
--- a/src/glsl/nir/nir_lower_clip.c
+++ b/src/glsl/nir/nir_lower_clip.c
@@ -72,6 +72,7 @@ store_clipdist_output(nir_builder *b, nir_variable *out, nir_ssa_def **val)
    store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
    store->num_components = 4;
    store->const_index[0] = out->data.driver_location;
+   store->const_index[1] = 0xf;   /* wrmask */
    store->src[0].ssa = nir_vec4(b, val[0], val[1], val[2], val[3]);
    store->src[0].is_ssa = true;
    store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));

From 33a415310b2d958319a4391f6b6d325d9c7292cc Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 16:22:40 -0500
Subject: [PATCH 027/241] st/mesa: sort extensions enablement array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_extensions.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index a0168d8e408..90eb67711f3 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -440,11 +440,15 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
       { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
       { o(ARB_clear_texture),                PIPE_CAP_CLEAR_TEXTURE                    },
+      { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
       { o(ARB_color_buffer_float),           PIPE_CAP_VERTEX_COLOR_UNCLAMPED           },
+      { o(ARB_conditional_render_inverted),  PIPE_CAP_CONDITIONAL_RENDER_INVERTED      },
       { o(ARB_copy_image),                   PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS },
       { o(ARB_depth_clamp),                  PIPE_CAP_DEPTH_CLIP_DISABLE               },
       { o(ARB_depth_texture),                PIPE_CAP_TEXTURE_SHADOW_MAP               },
+      { o(ARB_derivative_control),           PIPE_CAP_TGSI_FS_FINE_DERIVATIVE          },
       { o(ARB_draw_buffers_blend),           PIPE_CAP_INDEP_BLEND_FUNC                 },
+      { o(ARB_draw_indirect),                PIPE_CAP_DRAW_INDIRECT                    },
       { o(ARB_draw_instanced),               PIPE_CAP_TGSI_INSTANCEID                  },
       { o(ARB_fragment_program_shadow),      PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_framebuffer_object),           PIPE_CAP_MIXED_FRAMEBUFFER_SIZES          },
@@ -453,6 +457,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_occlusion_query2),             PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_pipeline_statistics_query),    PIPE_CAP_QUERY_PIPELINE_STATISTICS        },
       { o(ARB_point_sprite),                 PIPE_CAP_POINT_SPRITE                     },
+      { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
       { o(ARB_seamless_cube_map),            PIPE_CAP_SEAMLESS_CUBE_MAP                },
       { o(ARB_shader_draw_parameters),       PIPE_CAP_DRAW_PARAMETERS                  },
       { o(ARB_shader_stencil_export),        PIPE_CAP_SHADER_STENCIL_EXPORT            },
@@ -460,15 +465,21 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_shader_texture_lod),           PIPE_CAP_SM3                              },
       { o(ARB_shadow),                       PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_texture_buffer_object),        PIPE_CAP_TEXTURE_BUFFER_OBJECTS           },
+      { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
       { o(ARB_texture_gather),               PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS    },
       { o(ARB_texture_mirror_clamp_to_edge), PIPE_CAP_TEXTURE_MIRROR_CLAMP             },
+      { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
       { o(ARB_texture_non_power_of_two),     PIPE_CAP_NPOT_TEXTURES                    },
+      { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
+      { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
       { o(ARB_timer_query),                  PIPE_CAP_QUERY_TIMESTAMP                  },
       { o(ARB_transform_feedback2),          PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME       },
       { o(ARB_transform_feedback3),          PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME       },
 
       { o(EXT_blend_equation_separate),      PIPE_CAP_BLEND_EQUATION_SEPARATE          },
+      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
       { o(EXT_draw_buffers2),                PIPE_CAP_INDEP_BLEND_ENABLE               },
+      { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
       { o(EXT_stencil_two_side),             PIPE_CAP_TWO_SIDED_STENCIL                },
       { o(EXT_texture_array),                PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS         },
       { o(EXT_texture_filter_anisotropic),   PIPE_CAP_ANISOTROPIC_FILTER               },
@@ -489,17 +500,6 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(OES_standard_derivatives),         PIPE_CAP_SM3                              },
       { o(OES_texture_float_linear),         PIPE_CAP_TEXTURE_FLOAT_LINEAR             },
       { o(OES_texture_half_float_linear),    PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR        },
-      { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
-      { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
-      { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
-      { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
-      { o(ARB_draw_indirect),                PIPE_CAP_DRAW_INDIRECT                    },
-      { o(ARB_derivative_control),           PIPE_CAP_TGSI_FS_FINE_DERIVATIVE          },
-      { o(ARB_conditional_render_inverted),  PIPE_CAP_CONDITIONAL_RENDER_INVERTED      },
-      { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
-      { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
-      { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
-      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
    };
 
    /* Required: render target and sampler support */

From c1d14c6817e3fa9a1c04f9b6c51b4ca601637843 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 20:33:15 -0500
Subject: [PATCH 028/241] nv50,nvc0: make sure there's pushbuf space and that
 we ref the bo early

First off, we can't flush in the middle of a command. Secondly
requesting the extra push space might cause a flush to happen. If that
flush happens, we'd have to do the PUSH_REFN again. So instead do
PUSH_REFN after the push space request. This helps avoid rare crashes
with supertuxkart in libdrm due to assertion failures.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c          | 2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c     | 1 -
 src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c          | 7 +++----
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 7de2f1f1d0f..2d1aa6abcd2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -636,8 +636,8 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
          PUSH_DATA (push, prim);
 
-         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
          nouveau_pushbuf_space(push, 8, 0, 1);
+         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
 
          switch (index_size) {
          case 4:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index a70d524ea85..8021a65dc46 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -473,7 +473,6 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
 
    PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
                         NVC0_IB_ENTRY_1_NO_PREFETCH);
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 5e84ca9e0ea..dc02b011bdf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -317,6 +317,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
 
       if (!targ->clean)
          nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
+      nouveau_pushbuf_space(push, 0, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
       PUSH_DATA (push, 1);
       PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 273451e638d..235b1afc24b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -787,7 +787,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
    }
 
    while (num_instances--) {
-      PUSH_SPACE(push, 8);
+      nouveau_pushbuf_space(push, 9, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
       PUSH_DATA (push, mode);
       BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
@@ -822,7 +822,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    BEGIN_NVC0(push, NVC0_3D(CB_POS), 1);
    PUSH_DATA (push, 256 + 128);
 
-   PUSH_SPACE(push, 8);
+   nouveau_pushbuf_space(push, 8, 0, 1);
+   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
       assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
@@ -840,8 +841,6 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    }
    PUSH_DATA(push, nvc0_prim_gl(info->mode));
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
-   nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push,
                         buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
 }

From a2942d8f2696bdc2a98989f04275b497996d5478 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Fri, 1 Jan 2016 16:58:49 -0800
Subject: [PATCH 029/241] mesa: Fix warning with MESA_VERBOSE=api for
 BindBufferRange
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/bufferobj.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 181eb49d408..3a05cd55042 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3914,7 +3914,7 @@ _mesa_BindBufferRange(GLenum target, GLuint index,
    struct gl_buffer_object *bufObj;
 
    if (MESA_VERBOSE & VERBOSE_API) {
-      _mesa_debug(ctx, "glBindBufferRange(%s, %u, %u, %d, %d)\n",
+      _mesa_debug(ctx, "glBindBufferRange(%s, %u, %u, %ld, %ld)\n",
                   _mesa_enum_to_string(target), index, buffer, offset, size);
    }
 

From 294ed5cd13e878ec43126a2070343d6d99ef5669 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 6 Dec 2015 13:31:25 +0100
Subject: [PATCH 030/241] program: add _mesa_reserve_parameter_storage

The next commit will use this.

Reviewed-by: Brian Paul <brianp@vmware.com>
Cc: 11.0 11.1 <mesa-stable@lists.freedesktop.org>
---
 src/mesa/program/prog_parameter.c | 47 +++++++++++++++++++++----------
 src/mesa/program/prog_parameter.h |  4 +++
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 53e9813e6fd..e98946b9387 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -88,6 +88,37 @@ _mesa_free_parameter_list(struct gl_program_parameter_list *paramList)
 }
 
 
+/**
+ * Make sure there are enough unused parameter slots. Reallocate the list
+ * if needed.
+ *
+ * \param paramList        where to reserve parameter slots
+ * \param reserve_slots    number of slots to reserve
+ */
+void
+_mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
+                                unsigned reserve_slots)
+{
+   const GLuint oldNum = paramList->NumParameters;
+
+   if (oldNum + reserve_slots > paramList->Size) {
+      /* Need to grow the parameter list array (alloc some extra) */
+      paramList->Size = paramList->Size + 4 * reserve_slots;
+
+      /* realloc arrays */
+      paramList->Parameters =
+         realloc(paramList->Parameters,
+                 paramList->Size * sizeof(struct gl_program_parameter));
+
+      paramList->ParameterValues = (gl_constant_value (*)[4])
+         _mesa_align_realloc(paramList->ParameterValues,         /* old buf */
+                             oldNum * 4 * sizeof(gl_constant_value),/* old sz */
+                             paramList->Size*4*sizeof(gl_constant_value),/*new*/
+                             16);
+   }
+}
+
+
 /**
  * Add a new parameter to a parameter list.
  * Note that parameter values are usually 4-element GLfloat vectors.
@@ -115,21 +146,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList,
 
    assert(size > 0);
 
-   if (oldNum + sz4 > paramList->Size) {
-      /* Need to grow the parameter list array (alloc some extra) */
-      paramList->Size = paramList->Size + 4 * sz4;
-
-      /* realloc arrays */
-      paramList->Parameters =
-         realloc(paramList->Parameters,
-                 paramList->Size * sizeof(struct gl_program_parameter));
-
-      paramList->ParameterValues = (gl_constant_value (*)[4])
-         _mesa_align_realloc(paramList->ParameterValues,         /* old buf */
-                             oldNum * 4 * sizeof(gl_constant_value),/* old sz */
-                             paramList->Size*4*sizeof(gl_constant_value),/*new*/
-                             16);
-   }
+   _mesa_reserve_parameter_storage(paramList, sz4);
 
    if (!paramList->Parameters ||
        !paramList->ParameterValues) {
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index 74a5fd91804..44700b710d7 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -112,6 +112,10 @@ _mesa_num_parameters(const struct gl_program_parameter_list *list)
    return list ? list->NumParameters : 0;
 }
 
+extern void
+_mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
+                                unsigned reserve_slots);
+
 extern GLint
 _mesa_add_parameter(struct gl_program_parameter_list *paramList,
                     gl_register_file type, const char *name,

From 36c93a6fae275614b6004ec5ab085774d527e1bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 6 Dec 2015 13:36:57 +0100
Subject: [PATCH 031/241] st/mesa: fix GLSL uniform updates for glBitmap &
 glDrawPixels (v2)

Spotted by luck. The GLSL uniform storage is only associated once
in LinkShader and can't be reallocated afterwards, because that would
break the association.

v2: don't remove st_upload_constants calls, clarify why they're needed

Cc: 11.0 11.1 <mesa-stable@lists.freedesktop.org>
---
 src/mesa/state_tracker/st_cb_bitmap.c      |  6 +++++-
 src/mesa/state_tracker/st_cb_drawpixels.c  | 14 ++++++++++----
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  6 ++++++
 src/mesa/state_tracker/st_program.c        | 17 ++++-------------
 src/mesa/state_tracker/st_program.h        |  1 -
 5 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index cbc6845d771..a4a48a616fd 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -287,7 +287,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       GLfloat colorSave[4];
       COPY_4V(colorSave, ctx->Current.Attrib[VERT_ATTRIB_COLOR0]);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], color);
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], colorSave);
    }
 
@@ -404,6 +405,9 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_stream_outputs(cso);
 
    pipe_resource_reference(&vbuf, NULL);
+
+   /* We uploaded modified constants, need to invalidate them. */
+   st->dirty.mesa |= _NEW_PROGRAM_CONSTANTS;
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 262ad809c58..a125d1f149e 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -1110,8 +1110,11 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
          num_sampler_view++;
       }
 
-      /* update fragment program constants */
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      /* compiling a new fragment shader variant added new state constants
+       * into the constant buffer, we need to update them
+       */
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
    }
 
    /* Put glDrawPixels image into a texture */
@@ -1463,8 +1466,11 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
          num_sampler_view++;
       }
 
-      /* update fragment program constants */
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      /* compiling a new fragment shader variant added new state constants
+       * into the constant buffer, we need to update them
+       */
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
    }
    else {
       assert(type == GL_DEPTH);
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index ad3a6846558..cdbe2f4f8a8 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5688,6 +5688,12 @@ get_mesa_program(struct gl_context *ctx,
 
    _mesa_reference_program(ctx, &shader->Program, prog);
 
+   /* Avoid reallocation of the program parameter list, because the uniform
+    * storage is only associated with the original parameter list.
+    * This should be enough for Bitmap and DrawPixels constants.
+    */
+   _mesa_reserve_parameter_storage(prog->Parameters, 8);
+
    /* This has to be done last.  Any operation the can cause
     * prog->ParameterValues to get reallocated (e.g., anything that adds a
     * program constant) has to happen before creating this linkage.
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 75ccaf2f26b..39c54c256e0 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -112,8 +112,6 @@ delete_fp_variant(struct st_context *st, struct st_fp_variant *fpv)
 {
    if (fpv->driver_shader) 
       cso_delete_fragment_shader(st->cso_context, fpv->driver_shader);
-   if (fpv->parameters)
-      _mesa_free_parameter_list(fpv->parameters);
    free(fpv);
 }
 
@@ -914,8 +912,6 @@ st_create_fp_variant(struct st_context *st,
          if (tgsi.tokens != stfp->tgsi.tokens)
             tgsi_free_tokens(tgsi.tokens);
          tgsi.tokens = tokens;
-         variant->parameters =
-            _mesa_clone_parameter_list(stfp->Base.Base.Parameters);
       } else
          fprintf(stderr, "mesa: cannot create a shader for glBitmap\n");
    }
@@ -924,6 +920,7 @@ st_create_fp_variant(struct st_context *st,
    if (key->drawpixels) {
       const struct tgsi_token *tokens;
       unsigned scale_const = 0, bias_const = 0, texcoord_const = 0;
+      struct gl_program_parameter_list *params = stfp->Base.Base.Parameters;
 
       /* Find the first unused slot. */
       variant->drawpix_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1;
@@ -935,27 +932,21 @@ st_create_fp_variant(struct st_context *st,
          variant->pixelmap_sampler = ffs(~samplers_used) - 1;
       }
 
-      variant->parameters =
-         _mesa_clone_parameter_list(stfp->Base.Base.Parameters);
-
       if (key->scaleAndBias) {
          static const gl_state_index scale_state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_PT_SCALE };
          static const gl_state_index bias_state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_PT_BIAS };
 
-         scale_const = _mesa_add_state_reference(variant->parameters,
-                                                 scale_state);
-         bias_const = _mesa_add_state_reference(variant->parameters,
-                                                bias_state);
+         scale_const = _mesa_add_state_reference(params, scale_state);
+         bias_const = _mesa_add_state_reference(params, bias_state);
       }
 
       {
          static const gl_state_index state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_CURRENT_ATTRIB, VERT_ATTRIB_TEX0 };
 
-         texcoord_const = _mesa_add_state_reference(variant->parameters,
-                                                    state);
+         texcoord_const = _mesa_add_state_reference(params, state);
       }
 
       tokens = st_get_drawpix_shader(tgsi.tokens,
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index d9b53ac008c..a8571f0c441 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -80,7 +80,6 @@ struct st_fp_variant
    void *driver_shader;
 
    /** For glBitmap variants */
-   struct gl_program_parameter_list *parameters;
    uint bitmap_sampler;
 
    /** For glDrawPixels variants */

From ffc4716e9730ca162ce5dfcf0298125269c6d908 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 19 Dec 2015 16:44:52 +0100
Subject: [PATCH 032/241] u_upload_mgr: rework the application of alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function only aligned the size, but not the offset.
The offset was aligned only when the previous suballocation was aligned.
That yielded the correct offset alignment if the alignment was constant
for all suballocations.

Instead, directly align the offset, but allow an unaligned size.
There is no change in behavior, because the alignment is constant
at the moment.

This is a prerequisite for allowing a variable alignment for suballocations.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/util/u_upload_mgr.c | 24 +++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index b672fad6bf0..4148bae5ab5 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -185,15 +185,20 @@ u_upload_alloc(struct u_upload_mgr *upload,
                struct pipe_resource **outbuf,
                void **ptr)
 {
-   unsigned alloc_size = align(size, upload->alignment);
-   unsigned alloc_offset = align(min_out_offset, upload->alignment);
+   unsigned alignment = upload->alignment;
    unsigned buffer_size = upload->buffer ? upload->buffer->width0 : 0;
    unsigned offset;
 
+   min_out_offset = align(min_out_offset, alignment);
+
+   offset = align(upload->offset, alignment);
+   offset = MAX2(offset, min_out_offset);
+
    /* Make sure we have enough space in the upload buffer
-    * for the sub-allocation. */
-   if (unlikely(MAX2(upload->offset, alloc_offset) + alloc_size > buffer_size)) {
-      u_upload_alloc_buffer(upload, alloc_offset + alloc_size);
+    * for the sub-allocation.
+    */
+   if (unlikely(!upload->buffer || offset + size > buffer_size)) {
+      u_upload_alloc_buffer(upload, min_out_offset + size);
 
       if (unlikely(!upload->buffer)) {
          *out_offset = ~0;
@@ -202,11 +207,10 @@ u_upload_alloc(struct u_upload_mgr *upload,
          return;
       }
 
+      offset = min_out_offset;
       buffer_size = upload->buffer->width0;
    }
 
-   offset = MAX2(upload->offset, alloc_offset);
-
    if (unlikely(!upload->map)) {
       upload->map = pipe_buffer_map_range(upload->pipe, upload->buffer,
                                           offset,
@@ -224,8 +228,8 @@ u_upload_alloc(struct u_upload_mgr *upload,
       upload->map -= offset;
    }
 
-   assert(offset < upload->buffer->width0);
-   assert(offset + size <= upload->buffer->width0);
+   assert(offset < buffer_size);
+   assert(offset + size <= buffer_size);
    assert(size);
 
    /* Emit the return values: */
@@ -233,7 +237,7 @@ u_upload_alloc(struct u_upload_mgr *upload,
    pipe_resource_reference(outbuf, upload->buffer);
    *out_offset = offset;
 
-   upload->offset = offset + alloc_size;
+   upload->offset = offset + size;
 }
 
 void u_upload_data(struct u_upload_mgr *upload,

From 020009f7ccdffa84c6e1649c4e915954f5fd7cc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 19 Dec 2015 17:15:02 +0100
Subject: [PATCH 033/241] u_upload_mgr: pass alignment to u_upload_alloc
 manually
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fixed alignment of u_upload_mgr will go away.
This is the first step.

The motivation is that one u_upload_mgr can have multiple users,
each allocating from the same buffer, but requiring a different alignment.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c           | 2 +-
 src/gallium/auxiliary/indices/u_primconvert.c     | 2 +-
 src/gallium/auxiliary/util/u_upload_mgr.c         | 4 ++--
 src/gallium/auxiliary/util/u_upload_mgr.h         | 2 ++
 src/gallium/auxiliary/util/u_vbuf.c               | 4 ++--
 src/gallium/auxiliary/vl/vl_compositor.c          | 1 +
 src/gallium/drivers/freedreno/a3xx/fd3_context.c  | 2 +-
 src/gallium/drivers/freedreno/a3xx/fd3_emit.c     | 3 ++-
 src/gallium/drivers/freedreno/a4xx/fd4_context.c  | 2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c     | 3 ++-
 src/gallium/drivers/freedreno/freedreno_context.h | 2 ++
 src/gallium/drivers/r300/r300_render_translate.c  | 6 +++---
 src/gallium/drivers/r600/r600_state_common.c      | 2 +-
 src/gallium/drivers/radeon/r600_buffer_common.c   | 2 +-
 src/gallium/drivers/radeon/r600_pipe_common.c     | 2 +-
 src/gallium/drivers/radeonsi/si_descriptors.c     | 6 +++---
 src/gallium/drivers/radeonsi/si_state_draw.c      | 2 +-
 src/gallium/drivers/svga/svga_context.c           | 1 -
 src/gallium/drivers/svga/svga_context.h           | 2 ++
 src/gallium/drivers/svga/svga_state_constants.c   | 3 ++-
 src/gallium/drivers/vc4/vc4_resource.c            | 2 +-
 src/mesa/state_tracker/st_cb_bitmap.c             | 2 +-
 src/mesa/state_tracker/st_cb_clear.c              | 2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c         | 2 +-
 src/mesa/state_tracker/st_cb_drawtex.c            | 2 +-
 25 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index c5c33327702..691de81f20a 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -431,7 +431,7 @@ hud_alloc_vertices(struct hud_context *hud, struct vertex_queue *v,
    v->max_num_vertices = num_vertices;
    v->vbuf.stride = stride;
    u_upload_alloc(hud->uploader, 0, v->vbuf.stride * v->max_num_vertices,
-                  &v->vbuf.buffer_offset, &v->vbuf.buffer,
+                  16, &v->vbuf.buffer_offset, &v->vbuf.buffer,
                   (void**)&v->vertices);
 }
 
diff --git a/src/gallium/auxiliary/indices/u_primconvert.c b/src/gallium/auxiliary/indices/u_primconvert.c
index 70d3e8530b8..c0a31548433 100644
--- a/src/gallium/auxiliary/indices/u_primconvert.c
+++ b/src/gallium/auxiliary/indices/u_primconvert.c
@@ -156,7 +156,7 @@ util_primconvert_draw_vbo(struct primconvert_context *pc,
       pc->upload = u_upload_create(pc->pipe, 4096, 4, PIPE_BIND_INDEX_BUFFER);
    }
 
-   u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count,
+   u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count, 4,
                   &new_ib.offset, &new_ib.buffer, &dst);
 
    if (info->indexed) {
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 4148bae5ab5..3f790400e40 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -181,11 +181,11 @@ void
 u_upload_alloc(struct u_upload_mgr *upload,
                unsigned min_out_offset,
                unsigned size,
+               unsigned alignment,
                unsigned *out_offset,
                struct pipe_resource **outbuf,
                void **ptr)
 {
-   unsigned alignment = upload->alignment;
    unsigned buffer_size = upload->buffer ? upload->buffer->width0 : 0;
    unsigned offset;
 
@@ -249,7 +249,7 @@ void u_upload_data(struct u_upload_mgr *upload,
 {
    uint8_t *ptr;
 
-   u_upload_alloc(upload, min_out_offset, size,
+   u_upload_alloc(upload, min_out_offset, size, upload->alignment,
                   out_offset, outbuf,
                   (void**)&ptr);
    if (ptr)
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index 67c6daa4e7f..ad7135fc8da 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -74,6 +74,7 @@ void u_upload_unmap( struct u_upload_mgr *upload );
  * \param upload           Upload manager
  * \param min_out_offset   Minimum offset that should be returned in out_offset.
  * \param size             Size of the allocation.
+ * \param alignment        Alignment of the suballocation within the buffer
  * \param out_offset       Pointer to where the new buffer offset will be returned.
  * \param outbuf           Pointer to where the upload buffer will be returned.
  * \param ptr              Pointer to the allocated memory that is returned.
@@ -81,6 +82,7 @@ void u_upload_unmap( struct u_upload_mgr *upload );
 void u_upload_alloc(struct u_upload_mgr *upload,
                     unsigned min_out_offset,
                     unsigned size,
+                    unsigned alignment,
                     unsigned *out_offset,
                     struct pipe_resource **outbuf,
                     void **ptr);
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 54e9e717104..dd64e2d7949 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -454,7 +454,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 
       /* Create and map the output buffer. */
       u_upload_alloc(mgr->uploader, 0,
-                     key->output_stride * num_indices,
+                     key->output_stride * num_indices, 4,
                      &out_offset, &out_buffer,
                      (void**)&out_map);
       if (!out_buffer)
@@ -487,7 +487,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
       /* Create and map the output buffer. */
       u_upload_alloc(mgr->uploader,
                      key->output_stride * start_vertex,
-                     key->output_stride * num_vertices,
+                     key->output_stride * num_vertices, 4,
                      &out_offset, &out_buffer,
                      (void**)&out_map);
       if (!out_buffer)
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index afe53063b48..f160df63aa5 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -716,6 +716,7 @@ gen_vertex_data(struct vl_compositor *c, struct vl_compositor_state *s, struct u
    /* Allocate new memory for vertices. */
    u_upload_alloc(c->upload, 0,
                   c->vertex_buf.stride * VL_COMPOSITOR_MAX_LAYERS * 4, /* size */
+                  4, /* alignment */
                   &c->vertex_buf.buffer_offset, &c->vertex_buf.buffer,
                   (void**)&vb);
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 74cbbf2edd8..2413f152f94 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -172,7 +172,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 	fd3_query_context_init(pctx);
 
 	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+			BORDER_COLOR_UPLOAD_SIZE, 0);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 24afbc9e956..e65a352e7f6 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -145,7 +145,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	void *ptr;
 
 	u_upload_alloc(fd3_ctx->border_color_uploader,
-			0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+			0, BORDER_COLOR_UPLOAD_SIZE,
+		       BORDER_COLOR_UPLOAD_SIZE, &off,
 			&fd3_ctx->border_color_buf,
 			&ptr);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index e53e0c56c9a..1037adfebf8 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -172,7 +172,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 	fd4_query_context_init(pctx);
 
 	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+			BORDER_COLOR_UPLOAD_SIZE, 0);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index b9a28149722..bc62a5d9a4b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -133,7 +133,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	void *ptr;
 
 	u_upload_alloc(fd4_ctx->border_color_uploader,
-			0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+			0, BORDER_COLOR_UPLOAD_SIZE,
+		       BORDER_COLOR_UPLOAD_SIZE, &off,
 			&fd4_ctx->border_color_buf,
 			&ptr);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 571c8142bf7..418b71b95de 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -40,6 +40,8 @@
 #include "freedreno_gmem.h"
 #include "freedreno_util.h"
 
+#define BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE)
+
 struct fd_vertex_stateobj;
 
 struct fd_texture_stateobj {
diff --git a/src/gallium/drivers/r300/r300_render_translate.c b/src/gallium/drivers/r300/r300_render_translate.c
index caeeec05909..7221211deea 100644
--- a/src/gallium/drivers/r300/r300_render_translate.c
+++ b/src/gallium/drivers/r300/r300_render_translate.c
@@ -37,7 +37,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     switch (*index_size) {
     case 1:
         *out_buffer = NULL;
-        u_upload_alloc(r300->uploader, 0, count * 2,
+        u_upload_alloc(r300->uploader, 0, count * 2, 4,
                        &out_offset, out_buffer, &ptr);
 
         util_shorten_ubyte_elts_to_userptr(
@@ -51,7 +51,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 2:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 2,
+            u_upload_alloc(r300->uploader, 0, count * 2, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_ushort_elts_to_userptr(&r300->context, ib,
@@ -65,7 +65,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 4:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 4,
+            u_upload_alloc(r300->uploader, 0, count * 4, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_uint_elts_to_userptr(&r300->context, ib,
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index ca589fa7759..3051c9af09c 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1732,7 +1732,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 				}
 			}
 
-			u_upload_alloc(rctx->b.uploader, start, count * 2,
+			u_upload_alloc(rctx->b.uploader, start, count * 2, 256,
 				       &out_offset, &out_buffer, &ptr);
 
 			util_shorten_ubyte_elts_to_userptr(
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 18925277d2d..484f5c8d5b7 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -298,7 +298,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 			struct r600_resource *staging = NULL;
 
 			u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
-				       &offset, (struct pipe_resource**)&staging, (void**)&data);
+				       256, &offset, (struct pipe_resource**)&staging, (void**)&data);
 
 			if (staging) {
 				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 9a5e9878176..c044b6130a2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -85,7 +85,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	/* Upload vertices. The hw rectangle has only 3 vertices,
 	 * I guess the 4th one is derived from the first 3.
 	 * The vertex specification should match u_blitter's vertex element state. */
-	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb);
+	u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
 	if (!buf)
 		return;
 
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index b3719dea252..5b0ad8f5622 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -109,7 +109,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 	if (!desc->list_dirty)
 		return true;
 
-	u_upload_alloc(sctx->b.uploader, 0, list_size,
+	u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
 		       &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, &ptr);
 	if (!desc->buffer)
@@ -391,7 +391,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	 * directly through a staging buffer and don't go through
 	 * the fine-grained upload path.
 	 */
-	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
+	u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
 	if (!desc->buffer)
 		return false;
@@ -465,7 +465,7 @@ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuf
 {
 	void *tmp;
 
-	u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
+	u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
 		       (struct pipe_resource**)rbuffer, &tmp);
 	if (rbuffer)
 		util_memcpy_cpu_to_le32(tmp, ptr, size);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e5500111f43..d5540bec71d 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -818,7 +818,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * ib.index_size;
 
-			u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
+			u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256,
 				       &out_offset, &out_buffer, &ptr);
 			if (!out_buffer) {
 				pipe_resource_reference(&ib.buffer, NULL);
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index d407785ddd9..97e649e38ba 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -46,7 +46,6 @@
 #include "svga_winsys.h"
 
 #define CONST0_UPLOAD_DEFAULT_SIZE 65536
-#define CONST0_UPLOAD_ALIGNMENT 256
 
 DEBUG_GET_ONCE_BOOL_OPTION(no_swtnl, "SVGA_NO_SWTNL", FALSE)
 DEBUG_GET_ONCE_BOOL_OPTION(force_swtnl, "SVGA_FORCE_SWTNL", FALSE);
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 78e346a92b9..c282932cb18 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -74,6 +74,8 @@
  */
 #define SVGA_MAX_CONST_BUF_SIZE (4096 * 4 * sizeof(int))
 
+#define CONST0_UPLOAD_ALIGNMENT 256
+
 struct draw_vertex_shader;
 struct draw_fragment_shader;
 struct svga_shader_variant;
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 2cf41134bd6..8ab1693088a 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -613,7 +613,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
     */
    new_buf_size = align(new_buf_size, 16);
 
-   u_upload_alloc(svga->const0_upload, 0, new_buf_size, &offset,
+   u_upload_alloc(svga->const0_upload, 0, new_buf_size,
+                  CONST0_UPLOAD_ALIGNMENT, &offset,
                   &dst_buffer, &dst_map);
    if (!dst_map) {
       if (src_map)
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 9e6678a0625..308fb9fc77b 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -921,7 +921,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
 
         void *data;
         struct pipe_resource *shadow_rsc = NULL;
-        u_upload_alloc(vc4->uploader, 0, count * 2,
+        u_upload_alloc(vc4->uploader, 0, count * 2, 4,
                        shadow_offset, &shadow_rsc, &data);
         uint16_t *dst = data;
 
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index a4a48a616fd..14e8354d480 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -204,7 +204,7 @@ setup_bitmap_vertex_data(struct st_context *st, bool normalized,
       tBot = (GLfloat) height;
    }
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
                   vbuf_offset, vbuf, (void **) &vertices);
    if (!*vbuf) {
       return;
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 18efd14a57c..e09f5ec6a0b 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -184,7 +184,7 @@ draw_quad(struct st_context *st,
 
    vb.stride = 8 * sizeof(float);
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
                   &vb.buffer_offset, &vb.buffer,
                   (void **) &vertices);
    if (!vb.buffer) {
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index a125d1f149e..86e8a55e25e 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -457,7 +457,7 @@ draw_quad(struct gl_context *ctx, GLfloat x0, GLfloat y0, GLfloat z,
    struct pipe_resource *buf = NULL;
    unsigned offset;
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), &offset,
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), 4, &offset,
                   &buf, (void **) &verts);
    if (!buf) {
       return;
diff --git a/src/mesa/state_tracker/st_cb_drawtex.c b/src/mesa/state_tracker/st_cb_drawtex.c
index 2634b09d777..b3e4b5bb70c 100644
--- a/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/src/mesa/state_tracker/st_cb_drawtex.c
@@ -150,7 +150,7 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
       GLuint attr;
 
       u_upload_alloc(st->uploader, 0,
-                     numAttribs * 4 * 4 * sizeof(GLfloat),
+                     numAttribs * 4 * 4 * sizeof(GLfloat), 4,
                      &offset, &vbuffer, (void **) &vbuf);
       if (!vbuffer) {
          return;

From e0f932846c91e6eca6495c86b72e87af64dfefb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 19 Dec 2015 17:15:02 +0100
Subject: [PATCH 034/241] u_upload_mgr: pass alignment to u_upload_data
 manually
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/util/u_blitter.c        | 4 ++--
 src/gallium/auxiliary/util/u_upload_mgr.c     | 6 ++++--
 src/gallium/auxiliary/util/u_upload_mgr.h     | 1 +
 src/gallium/auxiliary/util/u_vbuf.c           | 2 +-
 src/gallium/drivers/ilo/ilo_gpgpu.c           | 2 +-
 src/gallium/drivers/ilo/ilo_state.c           | 4 ++--
 src/gallium/drivers/r300/r300_render.c        | 2 +-
 src/gallium/drivers/r300/r300_screen_buffer.c | 2 +-
 src/gallium/drivers/r600/r600_state_common.c  | 6 +++---
 src/gallium/drivers/radeonsi/si_state_draw.c  | 2 +-
 src/gallium/drivers/vc4/vc4_draw.c            | 2 +-
 src/gallium/drivers/virgl/virgl_context.c     | 2 +-
 src/gallium/state_trackers/nine/device9.c     | 8 +++++---
 src/gallium/state_trackers/nine/device9.h     | 1 +
 src/gallium/state_trackers/nine/nine_ff.c     | 2 ++
 src/gallium/state_trackers/nine/nine_state.c  | 2 ++
 src/mesa/state_tracker/st_atom_constbuf.c     | 1 +
 src/mesa/state_tracker/st_draw.c              | 2 +-
 src/mesa/state_tracker/st_extensions.c        | 5 +++--
 19 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 05b4567130e..a72d77df301 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -1191,7 +1191,7 @@ static void blitter_draw(struct blitter_context_priv *ctx,
 
    vb.stride = 8 * sizeof(float);
 
-   u_upload_data(ctx->upload, 0, sizeof(ctx->vertices), ctx->vertices,
+   u_upload_data(ctx->upload, 0, sizeof(ctx->vertices), 4, ctx->vertices,
                  &vb.buffer_offset, &vb.buffer);
    if (!vb.buffer)
       return;
@@ -2111,7 +2111,7 @@ void util_blitter_clear_buffer(struct blitter_context *blitter,
       return;
    }
 
-   u_upload_data(ctx->upload, 0, num_channels*4, clear_value,
+   u_upload_data(ctx->upload, 0, num_channels*4, 4, clear_value,
                  &vb.buffer_offset, &vb.buffer);
    if (!vb.buffer)
       goto out;
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 3f790400e40..646965c4070 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -243,13 +243,14 @@ u_upload_alloc(struct u_upload_mgr *upload,
 void u_upload_data(struct u_upload_mgr *upload,
                    unsigned min_out_offset,
                    unsigned size,
+                   unsigned alignment,
                    const void *data,
                    unsigned *out_offset,
                    struct pipe_resource **outbuf)
 {
    uint8_t *ptr;
 
-   u_upload_alloc(upload, min_out_offset, size, upload->alignment,
+   u_upload_alloc(upload, min_out_offset, size, alignment,
                   out_offset, outbuf,
                   (void**)&ptr);
    if (ptr)
@@ -282,6 +283,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
    if (0)
       debug_printf("upload ptr %p ofs %d sz %d\n", map, offset, size);
 
-   u_upload_data(upload, min_out_offset, size, map, out_offset, outbuf);
+   u_upload_data(upload, min_out_offset, size, upload->alignment,
+                 map, out_offset, outbuf);
    pipe_buffer_unmap( upload->pipe, transfer );
 }
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index ad7135fc8da..46624587257 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -97,6 +97,7 @@ void u_upload_alloc(struct u_upload_mgr *upload,
 void u_upload_data(struct u_upload_mgr *upload,
                    unsigned min_out_offset,
                    unsigned size,
+                   unsigned alignment,
                    const void *data,
                    unsigned *out_offset,
                    struct pipe_resource **outbuf);
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index dd64e2d7949..6e2c9aab26f 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -987,7 +987,7 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
       real_vb = &mgr->real_vertex_buffer[i];
       ptr = mgr->vertex_buffer[i].user_buffer;
 
-      u_upload_data(mgr->uploader, start, end - start, ptr + start,
+      u_upload_data(mgr->uploader, start, end - start, 4, ptr + start,
                     &real_vb->buffer_offset, &real_vb->buffer);
       if (!real_vb->buffer)
          return PIPE_ERROR_OUT_OF_MEMORY;
diff --git a/src/gallium/drivers/ilo/ilo_gpgpu.c b/src/gallium/drivers/ilo/ilo_gpgpu.c
index 9a2ca007f80..b7415901a88 100644
--- a/src/gallium/drivers/ilo/ilo_gpgpu.c
+++ b/src/gallium/drivers/ilo/ilo_gpgpu.c
@@ -92,7 +92,7 @@ ilo_launch_grid(struct pipe_context *pipe,
    input_buf.buffer_size =
       ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_INPUT_SIZE);
    if (input_buf.buffer_size) {
-      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, input,
+      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, 16, input,
             &input_buf.buffer_offset, &input_buf.buffer);
    }
 
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index d89765a9d23..0232713f7a2 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -376,7 +376,7 @@ finalize_cbuf_state(struct ilo_context *ilo,
       if (cbuf->cso[i].resource)
          continue;
 
-      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
+      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, 16,
             cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
 
       cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource);
@@ -426,7 +426,7 @@ finalize_index_buffer(struct ilo_context *ilo)
       unsigned hw_offset;
 
       if (vec->ib.state.user_buffer) {
-         u_upload_data(ilo->uploader, 0, size,
+         u_upload_data(ilo->uploader, 0, size, 16,
                vec->ib.state.user_buffer + offset,
                &hw_offset, &vec->ib.hw_resource);
       } else {
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index b482fa140ed..7eda6753d0d 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -1010,7 +1010,7 @@ static void r300_render_draw_elements(struct vbuf_render* render,
     CS_LOCALS(r300);
     DBG(r300, DBG_DRAW, "r300: render_draw_elements (count: %d)\n", count);
 
-    u_upload_data(r300->uploader, 0, count * 2, indices,
+    u_upload_data(r300->uploader, 0, count * 2, 4, indices,
                   &index_buffer_offset, &index_buffer);
     if (!index_buffer) {
         return;
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index 737a6f5e4f8..42c8e3a0fc5 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -42,7 +42,7 @@ void r300_upload_index_buffer(struct r300_context *r300,
     *index_buffer = NULL;
 
     u_upload_data(r300->uploader,
-                  0, count * index_size,
+                  0, count * index_size, 4,
                   ptr + (*start * index_size),
                   &index_offset,
                   index_buffer);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 3051c9af09c..c3346f29811 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1106,10 +1106,10 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint
 				tmpPtr[i] = util_cpu_to_le32(((uint32_t *)ptr)[i]);
 			}
 
-			u_upload_data(rctx->b.uploader, 0, size, tmpPtr, &cb->buffer_offset, &cb->buffer);
+			u_upload_data(rctx->b.uploader, 0, size, 256, tmpPtr, &cb->buffer_offset, &cb->buffer);
 			free(tmpPtr);
 		} else {
-			u_upload_data(rctx->b.uploader, 0, input->buffer_size, ptr, &cb->buffer_offset, &cb->buffer);
+			u_upload_data(rctx->b.uploader, 0, input->buffer_size, 256, ptr, &cb->buffer_offset, &cb->buffer);
 		}
 		/* account it in gtt */
 		rctx->b.gtt += input->buffer_size;
@@ -1753,7 +1753,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect ||
 						 info.instance_count > 1 ||
 						 info.count*ib.index_size > 20)) {
-			u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size,
+			u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size, 256,
 				      ib.user_buffer, &ib.offset, &ib.buffer);
 			ib.user_buffer = NULL;
 		}
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index d5540bec71d..87a5afbbc97 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -842,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			start_offset = start * ib.index_size;
 
 			u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
-				      (char*)ib.user_buffer + start_offset,
+				      256, (char*)ib.user_buffer + start_offset,
 				      &ib.offset, &ib.buffer);
 			if (!ib.buffer)
 				return;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index c00855698b8..9b0b540d3fc 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -319,7 +319,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                         if (vc4->indexbuf.user_buffer) {
                                 prsc = NULL;
                                 u_upload_data(vc4->uploader, 0,
-                                              info->count * index_size,
+                                              info->count * index_size, 4,
                                               vc4->indexbuf.user_buffer,
                                               &offset, &prsc);
                         } else {
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 527f7637cb6..58fe95736d1 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -605,7 +605,7 @@ static void virgl_draw_vbo(struct pipe_context *ctx,
            ib.offset = vctx->index_buffer.offset + info.start * ib.index_size;
 
            if (ib.user_buffer) {
-                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size,
+                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size, 256,
                                  ib.user_buffer, &ib.offset, &ib.buffer);
                    ib.user_buffer = NULL;
            }
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index f14ffea13e1..4e2a03b0eee 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -397,10 +397,9 @@ NineDevice9_ctor( struct NineDevice9 *This,
     if (!This->driver_caps.user_ibufs)
         This->index_uploader = u_upload_create(This->pipe, 128 * 1024, 4, PIPE_BIND_INDEX_BUFFER);
     if (!This->driver_caps.user_cbufs) {
-        unsigned alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
-
+        This->constbuf_alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
         This->constbuf_uploader = u_upload_create(This->pipe, This->vs_const_size,
-                                                  alignment, PIPE_BIND_CONSTANT_BUFFER);
+                                                  This->constbuf_alignment, PIPE_BIND_CONSTANT_BUFFER);
     }
 
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
@@ -2955,6 +2954,7 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
         u_upload_data(This->vertex_uploader,
                       0,
                       (info.max_index + 1) * VertexStreamZeroStride, /* XXX */
+                      4,
                       vtxbuf.user_buffer,
                       &vtxbuf.buffer_offset,
                       &vtxbuf.buffer);
@@ -3027,6 +3027,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
                       base,
                       (info.max_index -
                        info.min_index + 1) * VertexStreamZeroStride, /* XXX */
+                      4,
                       (const uint8_t *)vbuf.user_buffer + base,
                       &vbuf.buffer_offset,
                       &vbuf.buffer);
@@ -3039,6 +3040,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
         u_upload_data(This->index_uploader,
                       0,
                       info.count * ibuf.index_size,
+                      4,
                       ibuf.user_buffer,
                       &ibuf.offset,
                       &ibuf.buffer);
diff --git a/src/gallium/state_trackers/nine/device9.h b/src/gallium/state_trackers/nine/device9.h
index 98d9c4df06a..cbc1e61f5db 100644
--- a/src/gallium/state_trackers/nine/device9.h
+++ b/src/gallium/state_trackers/nine/device9.h
@@ -128,6 +128,7 @@ struct NineDevice9
     struct u_upload_mgr *vertex_uploader;
     struct u_upload_mgr *index_uploader;
     struct u_upload_mgr *constbuf_uploader;
+    unsigned constbuf_alignment;
 
     struct nine_range_pool range_pool;
 
diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index fe8933be69a..fe26086ef3d 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -1866,6 +1866,7 @@ nine_ff_update(struct NineDevice9 *device)
             u_upload_data(device->constbuf_uploader,
                           0,
                           cb.buffer_size,
+                          device->constbuf_alignment,
                           cb.user_buffer,
                           &cb.buffer_offset,
                           &cb.buffer);
@@ -1888,6 +1889,7 @@ nine_ff_update(struct NineDevice9 *device)
             u_upload_data(device->constbuf_uploader,
                           0,
                           cb.buffer_size,
+                          device->constbuf_alignment,
                           cb.user_buffer,
                           &cb.buffer_offset,
                           &cb.buffer);
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index 558d07a2bd0..aee31622088 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -260,6 +260,7 @@ prepare_vs_constants_userbuf(struct NineDevice9 *device)
         u_upload_data(device->constbuf_uploader,
                       0,
                       cb.buffer_size,
+                      device->constbuf_alignment,
                       cb.user_buffer,
                       &cb.buffer_offset,
                       &cb.buffer);
@@ -336,6 +337,7 @@ prepare_ps_constants_userbuf(struct NineDevice9 *device)
         u_upload_data(device->constbuf_uploader,
                       0,
                       cb.buffer_size,
+                      device->constbuf_alignment,
                       cb.user_buffer,
                       &cb.buffer_offset,
                       &cb.buffer);
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 20f8b3df99d..66811d29c29 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -84,6 +84,7 @@ void st_upload_constants( struct st_context *st,
          cb.buffer = NULL;
          cb.user_buffer = NULL;
          u_upload_data(st->constbuf_uploader, 0, paramBytes,
+                       st->ctx->Const.UniformBufferOffsetAlignment,
                        params->ParameterValues, &cb.buffer_offset, &cb.buffer);
          u_upload_unmap(st->constbuf_uploader);
       } else {
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 635a0126834..63b46222e6b 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -107,7 +107,7 @@ setup_index_buffer(struct st_context *st,
    else if (st->indexbuf_uploader) {
       /* upload indexes from user memory into a real buffer */
       u_upload_data(st->indexbuf_uploader, 0,
-                    ib->count * ibuffer->index_size, ib->ptr,
+                    ib->count * ibuffer->index_size, 4, ib->ptr,
                     &ibuffer->offset, &ibuffer->buffer);
       if (!ibuffer->buffer) {
          /* out of memory */
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 90eb67711f3..9b01bdc129e 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -314,10 +314,11 @@ void st_init_limits(struct pipe_screen *screen,
    c->GLSLSkipStrictMaxUniformLimitCheck =
       screen->get_param(screen, PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS);
 
+   c->UniformBufferOffsetAlignment =
+      screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
+
    if (can_ubo) {
       extensions->ARB_uniform_buffer_object = GL_TRUE;
-      c->UniformBufferOffsetAlignment =
-         screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
       c->MaxCombinedUniformBlocks = c->MaxUniformBufferBindings =
          c->Program[MESA_SHADER_VERTEX].MaxUniformBlocks +
          c->Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks +

From 1bb79c3a7bee7298b0415ee21a9412c98b1cfee5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 19 Dec 2015 17:15:02 +0100
Subject: [PATCH 035/241] u_upload_mgr: pass alignment to u_upload_buffer
 manually
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/util/u_upload_mgr.c | 3 ++-
 src/gallium/auxiliary/util/u_upload_mgr.h | 1 +
 src/gallium/drivers/ilo/ilo_state.c       | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 646965c4070..3c8d5c997a3 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -262,6 +262,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
                      unsigned min_out_offset,
                      unsigned offset,
                      unsigned size,
+                     unsigned alignment,
                      struct pipe_resource *inbuf,
                      unsigned *out_offset,
                      struct pipe_resource **outbuf)
@@ -283,7 +284,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
    if (0)
       debug_printf("upload ptr %p ofs %d sz %d\n", map, offset, size);
 
-   u_upload_data(upload, min_out_offset, size, upload->alignment,
+   u_upload_data(upload, min_out_offset, size, alignment,
                  map, out_offset, outbuf);
    pipe_buffer_unmap( upload->pipe, transfer );
 }
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index 46624587257..e9fe5faaed8 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -113,6 +113,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
                      unsigned min_out_offset,
                      unsigned offset,
                      unsigned size,
+                     unsigned alignment,
                      struct pipe_resource *inbuf,
                      unsigned *out_offset,
                      struct pipe_resource **outbuf);
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index 0232713f7a2..8dc2d38e039 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -431,7 +431,7 @@ finalize_index_buffer(struct ilo_context *ilo)
                &hw_offset, &vec->ib.hw_resource);
       } else {
          u_upload_buffer(ilo->uploader, 0,
-               vec->ib.state.offset + offset, size, vec->ib.state.buffer,
+               vec->ib.state.offset + offset, size, 16, vec->ib.state.buffer,
                &hw_offset, &vec->ib.hw_resource);
       }
 

From 37d0aea772a39f9ae7fe3d791e23c1be03ccf9de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 19 Dec 2015 17:43:48 +0100
Subject: [PATCH 036/241] u_upload_mgr: remove alignment parameter from
 u_upload_create
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c          |  2 +-
 src/gallium/auxiliary/indices/u_primconvert.c    |  2 +-
 src/gallium/auxiliary/util/u_blitter.c           |  2 +-
 src/gallium/auxiliary/util/u_upload_mgr.c        |  3 ---
 src/gallium/auxiliary/util/u_upload_mgr.h        |  2 --
 src/gallium/auxiliary/util/u_vbuf.c              |  2 +-
 src/gallium/auxiliary/vl/vl_compositor.c         |  2 +-
 src/gallium/drivers/freedreno/a3xx/fd3_context.c |  3 +--
 src/gallium/drivers/freedreno/a4xx/fd4_context.c |  3 +--
 src/gallium/drivers/ilo/ilo_context.c            |  2 +-
 src/gallium/drivers/r300/r300_context.c          |  2 +-
 src/gallium/drivers/radeon/r600_pipe_common.c    |  2 +-
 src/gallium/drivers/svga/svga_context.c          |  1 -
 src/gallium/drivers/vc4/vc4_context.c            |  2 +-
 src/gallium/drivers/virgl/virgl_context.c        |  2 +-
 src/gallium/state_trackers/nine/device9.c        |  6 +++---
 src/mesa/state_tracker/st_context.c              | 12 ++++--------
 17 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 691de81f20a..de019ebcb02 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -1176,7 +1176,7 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
 
    hud->pipe = pipe;
    hud->cso = cso;
-   hud->uploader = u_upload_create(pipe, 256 * 1024, 16,
+   hud->uploader = u_upload_create(pipe, 256 * 1024,
                                    PIPE_BIND_VERTEX_BUFFER);
 
    /* font */
diff --git a/src/gallium/auxiliary/indices/u_primconvert.c b/src/gallium/auxiliary/indices/u_primconvert.c
index c0a31548433..e21174a608f 100644
--- a/src/gallium/auxiliary/indices/u_primconvert.c
+++ b/src/gallium/auxiliary/indices/u_primconvert.c
@@ -153,7 +153,7 @@ util_primconvert_draw_vbo(struct primconvert_context *pc,
    }
 
    if (!pc->upload) {
-      pc->upload = u_upload_create(pc->pipe, 4096, 4, PIPE_BIND_INDEX_BUFFER);
+      pc->upload = u_upload_create(pc->pipe, 4096, PIPE_BIND_INDEX_BUFFER);
    }
 
    u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count, 4,
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index a72d77df301..833a79cf5de 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -320,7 +320,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    for (i = 0; i < 4; i++)
       ctx->vertices[i][0][3] = 1; /*v.w*/
 
-   ctx->upload = u_upload_create(pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+   ctx->upload = u_upload_create(pipe, 65536, PIPE_BIND_VERTEX_BUFFER);
 
    return &ctx->base;
 }
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 3c8d5c997a3..842a7f68e27 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -42,7 +42,6 @@ struct u_upload_mgr {
    struct pipe_context *pipe;
 
    unsigned default_size;  /* Minimum size of the upload buffer, in bytes. */
-   unsigned alignment;     /* Alignment of each sub-allocation. */
    unsigned bind;          /* Bitmask of PIPE_BIND_* flags. */
    unsigned map_flags;     /* Bitmask of PIPE_TRANSFER_* flags. */
    boolean map_persistent; /* If persistent mappings are supported. */
@@ -57,7 +56,6 @@ struct u_upload_mgr {
 
 struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
                                       unsigned default_size,
-                                      unsigned alignment,
                                       unsigned bind )
 {
    struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
@@ -66,7 +64,6 @@ struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
 
    upload->pipe = pipe;
    upload->default_size = default_size;
-   upload->alignment = alignment;
    upload->bind = bind;
 
    upload->map_persistent =
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index e9fe5faaed8..54e839bbdff 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -43,12 +43,10 @@ struct pipe_resource;
  *
  * \param pipe          Pipe driver.
  * \param default_size  Minimum size of the upload buffer, in bytes.
- * \param alignment     Alignment of each suballocation in the upload buffer.
  * \param bind          Bitmask of PIPE_BIND_* flags.
  */
 struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
                                       unsigned default_size,
-                                      unsigned alignment,
                                       unsigned bind );
 
 /**
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 6e2c9aab26f..060f3d14eb9 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -314,7 +314,7 @@ u_vbuf_create(struct pipe_context *pipe,
    mgr->translate_cache = translate_cache_create();
    memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));
 
-   mgr->uploader = u_upload_create(pipe, 1024 * 1024, 4,
+   mgr->uploader = u_upload_create(pipe, 1024 * 1024,
                                    PIPE_BIND_VERTEX_BUFFER);
 
    return mgr;
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index f160df63aa5..10ac1712f19 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -1091,7 +1091,7 @@ vl_compositor_init(struct vl_compositor *c, struct pipe_context *pipe)
 
    c->pipe = pipe;
 
-   c->upload = u_upload_create(pipe, 128 * 1024, 4, PIPE_BIND_VERTEX_BUFFER);
+   c->upload = u_upload_create(pipe, 128 * 1024, PIPE_BIND_VERTEX_BUFFER);
 
    if (!c->upload)
       return false;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 2413f152f94..edc716e25c8 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -171,8 +171,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd3_query_context_init(pctx);
 
-	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			BORDER_COLOR_UPLOAD_SIZE, 0);
+	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index 1037adfebf8..f074a01b886 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -171,8 +171,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd4_query_context_init(pctx);
 
-	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-			BORDER_COLOR_UPLOAD_SIZE, 0);
+	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c
index 2a00cf1c93c..6c885b1bf83 100644
--- a/src/gallium/drivers/ilo/ilo_context.c
+++ b/src/gallium/drivers/ilo/ilo_context.c
@@ -189,7 +189,7 @@ ilo_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
     * These must be called last as u_upload/u_blitter are clients of the pipe
     * context.
     */
-   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024, 16,
+   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024,
          PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER);
    if (!ilo->uploader) {
       ilo_context_destroy(&ilo->base);
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index b393769c861..ea47c3fff5b 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -421,7 +421,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.create_video_codec = vl_create_decoder;
     r300->context.create_video_buffer = vl_video_buffer_create;
 
-    r300->uploader = u_upload_create(&r300->context, 256 * 1024, 4,
+    r300->uploader = u_upload_create(&r300->context, 256 * 1024,
                                      PIPE_BIND_CUSTOM);
 
     r300->blitter = util_blitter_create(&r300->context);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index c044b6130a2..7d971209305 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -272,7 +272,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (!rctx->allocator_so_filled_size)
 		return false;
 
-	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256,
+	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
 					PIPE_BIND_INDEX_BUFFER |
 					PIPE_BIND_CONSTANT_BUFFER);
 	if (!rctx->uploader)
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index 97e649e38ba..460804ccd9d 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -219,7 +219,6 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen,
 
    svga->const0_upload = u_upload_create(&svga->pipe,
                                          CONST0_UPLOAD_DEFAULT_SIZE,
-                                         CONST0_UPLOAD_ALIGNMENT,
                                          PIPE_BIND_CONSTANT_BUFFER);
    if (!svga->const0_upload)
       goto cleanup;
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 312b006f96e..59c01d86658 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -254,7 +254,7 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
         if (!vc4->primconvert)
                 goto fail;
 
-        vc4->uploader = u_upload_create(pctx, 16 * 1024, 4,
+        vc4->uploader = u_upload_create(pctx, 16 * 1024,
                                         PIPE_BIND_INDEX_BUFFER);
 
         vc4_debug |= saved_shaderdb_flag;
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 58fe95736d1..6e74e9a07bc 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -948,7 +948,7 @@ struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
                     16, UTIL_SLAB_SINGLETHREADED);
 
    vctx->primconvert = util_primconvert_create(&vctx->base, rs->caps.caps.v1.prim_mask);
-   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024, 256,
+   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024,
                                      PIPE_BIND_INDEX_BUFFER);
    if (!vctx->uploader)
            goto fail;
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 4e2a03b0eee..3d3f505a827 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -393,13 +393,13 @@ NineDevice9_ctor( struct NineDevice9 *This,
     This->driver_caps.user_cbufs = GET_PCAP(USER_CONSTANT_BUFFERS);
 
     if (!This->driver_caps.user_vbufs)
-        This->vertex_uploader = u_upload_create(This->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+        This->vertex_uploader = u_upload_create(This->pipe, 65536, PIPE_BIND_VERTEX_BUFFER);
     if (!This->driver_caps.user_ibufs)
-        This->index_uploader = u_upload_create(This->pipe, 128 * 1024, 4, PIPE_BIND_INDEX_BUFFER);
+        This->index_uploader = u_upload_create(This->pipe, 128 * 1024, PIPE_BIND_INDEX_BUFFER);
     if (!This->driver_caps.user_cbufs) {
         This->constbuf_alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
         This->constbuf_uploader = u_upload_create(This->pipe, This->vs_const_size,
-                                                  This->constbuf_alignment, PIPE_BIND_CONSTANT_BUFFER);
+                                                  PIPE_BIND_CONSTANT_BUFFER);
     }
 
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 1459f258f94..4f6d379723f 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -172,20 +172,16 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
    /* Create upload manager for vertex data for glBitmap, glDrawPixels,
     * glClear, etc.
     */
-   st->uploader = u_upload_create(st->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+   st->uploader = u_upload_create(st->pipe, 65536, PIPE_BIND_VERTEX_BUFFER);
 
    if (!screen->get_param(screen, PIPE_CAP_USER_INDEX_BUFFERS)) {
-      st->indexbuf_uploader = u_upload_create(st->pipe, 128 * 1024, 4,
+      st->indexbuf_uploader = u_upload_create(st->pipe, 128 * 1024,
                                               PIPE_BIND_INDEX_BUFFER);
    }
 
-   if (!screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS)) {
-      unsigned alignment =
-         screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
-
-      st->constbuf_uploader = u_upload_create(pipe, 128 * 1024, alignment,
+   if (!screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS))
+      st->constbuf_uploader = u_upload_create(pipe, 128 * 1024,
                                               PIPE_BIND_CONSTANT_BUFFER);
-   }
 
    st->cso_context = cso_create_context(pipe);
 

From ecb2da1559bcb9a9eec7ac224c8ff47b026c95ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 19 Dec 2015 17:54:31 +0100
Subject: [PATCH 037/241] u_upload_mgr: allow specifying PIPE_USAGE_* for the
 upload buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/hud/hud_context.c          |  2 +-
 src/gallium/auxiliary/indices/u_primconvert.c    |  3 ++-
 src/gallium/auxiliary/util/u_blitter.c           |  3 ++-
 src/gallium/auxiliary/util/u_upload_mgr.c        | 10 ++++++----
 src/gallium/auxiliary/util/u_upload_mgr.h        |  7 ++++---
 src/gallium/auxiliary/util/u_vbuf.c              |  3 ++-
 src/gallium/auxiliary/vl/vl_compositor.c         |  3 ++-
 src/gallium/drivers/freedreno/a3xx/fd3_context.c |  3 ++-
 src/gallium/drivers/freedreno/a4xx/fd4_context.c |  3 ++-
 src/gallium/drivers/ilo/ilo_context.c            |  3 ++-
 src/gallium/drivers/r300/r300_context.c          |  2 +-
 src/gallium/drivers/radeon/r600_pipe_common.c    |  2 +-
 src/gallium/drivers/svga/svga_context.c          |  3 ++-
 src/gallium/drivers/vc4/vc4_context.c            |  3 ++-
 src/gallium/drivers/virgl/virgl_context.c        |  2 +-
 src/gallium/state_trackers/nine/device9.c        |  8 +++++---
 src/mesa/state_tracker/st_context.c              |  9 ++++++---
 17 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index de019ebcb02..75afebe4919 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -1177,7 +1177,7 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
    hud->pipe = pipe;
    hud->cso = cso;
    hud->uploader = u_upload_create(pipe, 256 * 1024,
-                                   PIPE_BIND_VERTEX_BUFFER);
+                                   PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
 
    /* font */
    if (!util_font_create(pipe, UTIL_FONT_FIXED_8X13, &hud->font)) {
diff --git a/src/gallium/auxiliary/indices/u_primconvert.c b/src/gallium/auxiliary/indices/u_primconvert.c
index e21174a608f..5effd883f67 100644
--- a/src/gallium/auxiliary/indices/u_primconvert.c
+++ b/src/gallium/auxiliary/indices/u_primconvert.c
@@ -153,7 +153,8 @@ util_primconvert_draw_vbo(struct primconvert_context *pc,
    }
 
    if (!pc->upload) {
-      pc->upload = u_upload_create(pc->pipe, 4096, PIPE_BIND_INDEX_BUFFER);
+      pc->upload = u_upload_create(pc->pipe, 4096, PIPE_BIND_INDEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
    }
 
    u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count, 4,
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 833a79cf5de..43fbd8e6452 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -320,7 +320,8 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    for (i = 0; i < 4; i++)
       ctx->vertices[i][0][3] = 1; /*v.w*/
 
-   ctx->upload = u_upload_create(pipe, 65536, PIPE_BIND_VERTEX_BUFFER);
+   ctx->upload = u_upload_create(pipe, 65536, PIPE_BIND_VERTEX_BUFFER,
+                                 PIPE_USAGE_STREAM);
 
    return &ctx->base;
 }
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 842a7f68e27..aa31ef2a4bd 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -43,6 +43,7 @@ struct u_upload_mgr {
 
    unsigned default_size;  /* Minimum size of the upload buffer, in bytes. */
    unsigned bind;          /* Bitmask of PIPE_BIND_* flags. */
+   unsigned usage;         /* PIPE_USAGE_* */
    unsigned map_flags;     /* Bitmask of PIPE_TRANSFER_* flags. */
    boolean map_persistent; /* If persistent mappings are supported. */
 
@@ -54,9 +55,9 @@ struct u_upload_mgr {
 };
 
 
-struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
-                                      unsigned default_size,
-                                      unsigned bind )
+struct u_upload_mgr *
+u_upload_create(struct pipe_context *pipe, unsigned default_size,
+                unsigned bind, unsigned usage)
 {
    struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
    if (!upload)
@@ -65,6 +66,7 @@ struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
    upload->pipe = pipe;
    upload->default_size = default_size;
    upload->bind = bind;
+   upload->usage = usage;
 
    upload->map_persistent =
       pipe->screen->get_param(pipe->screen,
@@ -146,7 +148,7 @@ u_upload_alloc_buffer(struct u_upload_mgr *upload,
    buffer.target = PIPE_BUFFER;
    buffer.format = PIPE_FORMAT_R8_UNORM; /* want TYPELESS or similar */
    buffer.bind = upload->bind;
-   buffer.usage = PIPE_USAGE_STREAM;
+   buffer.usage = upload->usage;
    buffer.width0 = size;
    buffer.height0 = 1;
    buffer.depth0 = 1;
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index 54e839bbdff..1d933d754ae 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -44,10 +44,11 @@ struct pipe_resource;
  * \param pipe          Pipe driver.
  * \param default_size  Minimum size of the upload buffer, in bytes.
  * \param bind          Bitmask of PIPE_BIND_* flags.
+ * \param usage         PIPE_USAGE_*
  */
-struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
-                                      unsigned default_size,
-                                      unsigned bind );
+struct u_upload_mgr *
+u_upload_create(struct pipe_context *pipe, unsigned default_size,
+                unsigned bind, unsigned usage);
 
 /**
  * Destroy the upload manager.
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 060f3d14eb9..e16ee3651e6 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -315,7 +315,8 @@ u_vbuf_create(struct pipe_context *pipe,
    memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));
 
    mgr->uploader = u_upload_create(pipe, 1024 * 1024,
-                                   PIPE_BIND_VERTEX_BUFFER);
+                                   PIPE_BIND_VERTEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
 
    return mgr;
 }
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index 10ac1712f19..77688f0f99f 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -1091,7 +1091,8 @@ vl_compositor_init(struct vl_compositor *c, struct pipe_context *pipe)
 
    c->pipe = pipe;
 
-   c->upload = u_upload_create(pipe, 128 * 1024, PIPE_BIND_VERTEX_BUFFER);
+   c->upload = u_upload_create(pipe, 128 * 1024, PIPE_BIND_VERTEX_BUFFER,
+                               PIPE_USAGE_STREAM);
 
    if (!c->upload)
       return false;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index edc716e25c8..e47bbff5643 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -171,7 +171,8 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd3_query_context_init(pctx);
 
-	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0);
+	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index f074a01b886..7d6365bbb6d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -171,7 +171,8 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
 	fd4_query_context_init(pctx);
 
-	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0);
+	fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
 	return pctx;
 }
diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c
index 6c885b1bf83..6bcd0bcb8f5 100644
--- a/src/gallium/drivers/ilo/ilo_context.c
+++ b/src/gallium/drivers/ilo/ilo_context.c
@@ -190,7 +190,8 @@ ilo_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
     * context.
     */
    ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024,
-         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER);
+         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
    if (!ilo->uploader) {
       ilo_context_destroy(&ilo->base);
       return NULL;
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index ea47c3fff5b..82ba0435118 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -422,7 +422,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.create_video_buffer = vl_video_buffer_create;
 
     r300->uploader = u_upload_create(&r300->context, 256 * 1024,
-                                     PIPE_BIND_CUSTOM);
+                                     PIPE_BIND_CUSTOM, PIPE_USAGE_STREAM);
 
     r300->blitter = util_blitter_create(&r300->context);
     if (r300->blitter == NULL)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 7d971209305..0ab4a60a919 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -274,7 +274,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 
 	rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
 					PIPE_BIND_INDEX_BUFFER |
-					PIPE_BIND_CONSTANT_BUFFER);
+					PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
 	if (!rctx->uploader)
 		return false;
 
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index 460804ccd9d..b10eb45e548 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -219,7 +219,8 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen,
 
    svga->const0_upload = u_upload_create(&svga->pipe,
                                          CONST0_UPLOAD_DEFAULT_SIZE,
-                                         PIPE_BIND_CONSTANT_BUFFER);
+                                         PIPE_BIND_CONSTANT_BUFFER,
+                                         PIPE_USAGE_STREAM);
    if (!svga->const0_upload)
       goto cleanup;
 
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 59c01d86658..a0888f23265 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -255,7 +255,8 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
                 goto fail;
 
         vc4->uploader = u_upload_create(pctx, 16 * 1024,
-                                        PIPE_BIND_INDEX_BUFFER);
+                                        PIPE_BIND_INDEX_BUFFER,
+                                        PIPE_USAGE_STREAM);
 
         vc4_debug |= saved_shaderdb_flag;
 
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 6e74e9a07bc..c322503d816 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -949,7 +949,7 @@ struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
 
    vctx->primconvert = util_primconvert_create(&vctx->base, rs->caps.caps.v1.prim_mask);
    vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024,
-                                     PIPE_BIND_INDEX_BUFFER);
+                                     PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
    if (!vctx->uploader)
            goto fail;
 
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 3d3f505a827..0be83658928 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -393,13 +393,15 @@ NineDevice9_ctor( struct NineDevice9 *This,
     This->driver_caps.user_cbufs = GET_PCAP(USER_CONSTANT_BUFFERS);
 
     if (!This->driver_caps.user_vbufs)
-        This->vertex_uploader = u_upload_create(This->pipe, 65536, PIPE_BIND_VERTEX_BUFFER);
+        This->vertex_uploader = u_upload_create(This->pipe, 65536,
+                                                PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
     if (!This->driver_caps.user_ibufs)
-        This->index_uploader = u_upload_create(This->pipe, 128 * 1024, PIPE_BIND_INDEX_BUFFER);
+        This->index_uploader = u_upload_create(This->pipe, 128 * 1024,
+                                               PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
     if (!This->driver_caps.user_cbufs) {
         This->constbuf_alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
         This->constbuf_uploader = u_upload_create(This->pipe, This->vs_const_size,
-                                                  PIPE_BIND_CONSTANT_BUFFER);
+                                                  PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
     }
 
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 4f6d379723f..2fb792d628f 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -172,16 +172,19 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
    /* Create upload manager for vertex data for glBitmap, glDrawPixels,
     * glClear, etc.
     */
-   st->uploader = u_upload_create(st->pipe, 65536, PIPE_BIND_VERTEX_BUFFER);
+   st->uploader = u_upload_create(st->pipe, 65536, PIPE_BIND_VERTEX_BUFFER,
+                                  PIPE_USAGE_STREAM);
 
    if (!screen->get_param(screen, PIPE_CAP_USER_INDEX_BUFFERS)) {
       st->indexbuf_uploader = u_upload_create(st->pipe, 128 * 1024,
-                                              PIPE_BIND_INDEX_BUFFER);
+                                              PIPE_BIND_INDEX_BUFFER,
+                                              PIPE_USAGE_STREAM);
    }
 
    if (!screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS))
       st->constbuf_uploader = u_upload_create(pipe, 128 * 1024,
-                                              PIPE_BIND_CONSTANT_BUFFER);
+                                              PIPE_BIND_CONSTANT_BUFFER,
+                                              PIPE_USAGE_STREAM);
 
    st->cso_context = cso_create_context(pipe);
 

From b6847062dd5c504023dfbef8e6b3118136ee506c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 30 Dec 2015 14:55:34 -0500
Subject: [PATCH 038/241] gallium/radeon: implement set_debug_callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 12 ++++++++++++
 src/gallium/drivers/radeon/r600_pipe_common.h |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 0ab4a60a919..4bfb3591c84 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -227,6 +227,17 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 	return PIPE_UNKNOWN_CONTEXT_RESET;
 }
 
+static void r600_set_debug_callback(struct pipe_context *ctx,
+				    const struct pipe_debug_callback *cb)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+	if (cb)
+		rctx->debug = *cb;
+	else
+		memset(&rctx->debug, 0, sizeof(rctx->debug));
+}
+
 bool r600_common_context_init(struct r600_common_context *rctx,
 			      struct r600_common_screen *rscreen)
 {
@@ -252,6 +263,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->b.transfer_inline_write = u_default_transfer_inline_write;
         rctx->b.memory_barrier = r600_memory_barrier;
 	rctx->b.flush = r600_flush_from_st;
+	rctx->b.set_debug_callback = r600_set_debug_callback;
 
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
 		rctx->b.get_device_reset_status = r600_get_reset_status;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index c3933b1da98..a69e627a2e9 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -440,6 +440,8 @@ struct r600_common_context {
 	 * the GPU addresses are updated. */
 	struct list_head		texture_buffers;
 
+	struct pipe_debug_callback	debug;
+
 	/* Copy one resource to another using async DMA. */
 	void (*dma_copy)(struct pipe_context *ctx,
 			 struct pipe_resource *dst,

From 4bb1c8dfecef133822511f6147eac317e4690345 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 30 Dec 2015 15:02:57 -0500
Subject: [PATCH 039/241] radeonsi: pass pipe_debug_callback down into
 si_shader_binary_read (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will allow us to send shader debug info.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com> (v1)
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c     |  4 ++--
 src/gallium/drivers/radeonsi/si_shader.c      | 21 ++++++++++++-------
 src/gallium/drivers/radeonsi/si_shader.h      |  9 +++++---
 .../drivers/radeonsi/si_state_shaders.c       |  2 +-
 4 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 47a74eea0e0..2565117581e 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -123,7 +123,7 @@ static void *si_create_compute_state(
 		        LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
                                                         code, header->num_bytes);
 			si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm,
-					mod);
+					mod, &sctx->b.debug);
 			LLVMDisposeModule(mod);
 		}
 	}
@@ -136,7 +136,7 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader);
+	si_shader_binary_read(sctx->screen, &program->shader, &sctx->b.debug);
 
 #endif
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0e98784d51b..309219f7bc5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3840,7 +3840,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 	return 0;
 }
 
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
+int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
+			  struct pipe_debug_callback *debug)
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
 	unsigned i;
@@ -3878,7 +3879,8 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
 }
 
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod)
+		    LLVMTargetMachineRef tm, LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug)
 {
 	int r = 0;
 	bool dump_asm = r600_can_dump_shader(&sscreen->b,
@@ -3896,7 +3898,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 			return r;
 	}
 
-	r = si_shader_binary_read(sscreen, shader);
+	r = si_shader_binary_read(sscreen, shader, debug);
 
 	FREE(shader->binary.config);
 	FREE(shader->binary.rodata);
@@ -3913,7 +3915,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 				      struct si_shader_context *si_shader_ctx,
-				      struct si_shader *gs, bool dump)
+				      struct si_shader *gs, bool dump,
+				      struct pipe_debug_callback *debug)
 {
 	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
 	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
@@ -3980,7 +3983,8 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 		fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
 
 	r = si_compile_llvm(sscreen, si_shader_ctx->shader,
-			    si_shader_ctx->tm, bld_base->base.gallivm->module);
+			    si_shader_ctx->tm, bld_base->base.gallivm->module,
+			    debug);
 
 	radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
 
@@ -4034,7 +4038,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 }
 
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader)
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug)
 {
 	struct si_shader_selector *sel = shader->selector;
 	struct tgsi_token *tokens = sel->tokens;
@@ -4188,7 +4193,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
 
 	mod = bld_base->base.gallivm->module;
-	r = si_compile_llvm(sscreen, shader, tm, mod);
+	r = si_compile_llvm(sscreen, shader, tm, mod, debug);
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
 		goto out;
@@ -4202,7 +4207,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		shader->gs_copy_shader->key = shader->key;
 		si_shader_ctx.shader = shader->gs_copy_shader;
 		if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
-						    shader, dump))) {
+						    shader, dump, debug))) {
 			free(shader->gs_copy_shader);
 			shader->gs_copy_shader = NULL;
 			goto out;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b0c8680ecb3..adcbb332e9d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -327,14 +327,17 @@ static inline bool si_vs_exports_prim_id(struct si_shader *shader)
 
 /* radeonsi_shader.c */
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-		     struct si_shader *shader);
+		     struct si_shader *shader,
+		     struct pipe_debug_callback *debug);
 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod);
+		    LLVMTargetMachineRef tm, LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug);
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
+int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
+			  struct pipe_debug_callback *debug);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 8700590435f..c7045c31d56 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -616,7 +616,7 @@ static int si_shader_select(struct pipe_context *ctx,
 	shader->selector = sel;
 	shader->key = key;
 
-	r = si_shader_create(sctx->screen, sctx->tm, shader);
+	r = si_shader_create(sctx->screen, sctx->tm, shader, &sctx->b.debug);
 	if (unlikely(r)) {
 		R600_ERR("Failed to build shader variant (type=%u) %d\n",
 			 sel->type, r);

From f8cd11403a8029ae6e080c59c80f9d649578e5ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sat, 2 Jan 2016 16:30:57 -0500
Subject: [PATCH 040/241] radeonsi: send shader info as debug messages in
 addition to stderr output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The output via stderr is very helpful for ad-hoc debugging tasks, so that remains
unchanged, but having the information available via debug messages as well
will allow the use of parallel shader-db runs.

Shader stats are always provided (if the context is a debug context, that is),
but you still have to enable the appropriate R600_DEBUG flags to get
disassembly (since it is rather spammy and is only generated by LLVM when we
explicitly ask for it).

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 69 +++++++++++++++++++-----
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 309219f7bc5..a34f7da711d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3840,11 +3840,57 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 	return 0;
 }
 
+static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
+				       struct pipe_debug_callback *debug)
+{
+	char *line, *p;
+	unsigned i, count;
+
+	if (binary->disasm_string) {
+		fprintf(stderr, "\nShader Disassembly:\n\n");
+		fprintf(stderr, "%s\n", binary->disasm_string);
+
+		if (debug && debug->debug_message) {
+			/* Very long debug messages are cut off, so send the
+			 * disassembly one line at a time. This causes more
+			 * overhead, but on the plus side it simplifies
+			 * parsing of resulting logs.
+			 */
+			pipe_debug_message(debug, SHADER_INFO,
+					   "Shader Disassembly Begin");
+
+			line = binary->disasm_string;
+			while (*line) {
+				p = strchrnul(line, '\n');
+				count = p - line;
+
+				if (count) {
+					pipe_debug_message(debug, SHADER_INFO,
+							   "%.*s", count, line);
+				}
+
+				if (!*p)
+					break;
+				line = p + 1;
+			}
+
+			pipe_debug_message(debug, SHADER_INFO,
+					   "Shader Disassembly End");
+		}
+	} else {
+		fprintf(stderr, "SI CODE:\n");
+		for (i = 0; i < binary->code_size; i += 4) {
+			fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
+				binary->code[i + 3], binary->code[i + 2],
+				binary->code[i + 1], binary->code[i]);
+		}
+	}
+}
+
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 			  struct pipe_debug_callback *debug)
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
-	unsigned i;
 	int r;
 	bool dump  = r600_can_dump_shader(&sscreen->b,
 		shader->selector ? shader->selector->tokens : NULL);
@@ -3855,19 +3901,8 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 		return r;
 
 	if (dump) {
-		if (!(sscreen->b.debug_flags & DBG_NO_ASM)) {
-			if (binary->disasm_string) {
-				fprintf(stderr, "\nShader Disassembly:\n\n");
-				fprintf(stderr, "%s\n", binary->disasm_string);
-			} else {
-				fprintf(stderr, "SI CODE:\n");
-				for (i = 0; i < binary->code_size; i+=4 ) {
-					fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
-					binary->code[i + 2], binary->code[i + 1],
-					binary->code[i]);
-				}
-			}
-		}
+		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
+			si_shader_dump_disassembly(binary, debug);
 
 		fprintf(stderr, "*** SHADER STATS ***\n"
 			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
@@ -3875,6 +3910,12 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 			shader->num_sgprs, shader->num_vgprs, binary->code_size,
 			shader->lds_size, shader->scratch_bytes_per_wave);
 	}
+
+	pipe_debug_message(debug, SHADER_INFO,
+			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
+			   shader->num_sgprs, shader->num_vgprs, binary->code_size,
+			   shader->lds_size, shader->scratch_bytes_per_wave);
+
 	return 0;
 }
 

From 255ccd1e99e2eb8ad9ae001e3796afc344ca15c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 30 Dec 2015 16:00:56 -0500
Subject: [PATCH 041/241] gallium/radeon: pass pipe_debug_callback into
 radeon_llvm_compile (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will allow us to send shader debug info via the context's debug callback.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com> (v1)
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/evergreen_compute.c  | 2 +-
 src/gallium/drivers/r600/r600_llvm.c          | 5 +++--
 src/gallium/drivers/r600/r600_llvm.h          | 4 +++-
 src/gallium/drivers/r600/r600_shader.c        | 3 ++-
 src/gallium/drivers/radeon/radeon_llvm_emit.c | 6 ++++--
 src/gallium/drivers/radeon/radeon_llvm_emit.h | 4 +++-
 src/gallium/drivers/radeonsi/si_shader.c      | 3 ++-
 7 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index d83eb17c280..20945ece155 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -600,7 +600,7 @@ static void evergreen_launch_grid(
                            ctx->screen->has_compressed_msaa_texturing);
                 bc->type = TGSI_PROCESSOR_COMPUTE;
                 bc->isa = ctx->isa;
-                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
+                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);
 
                 if (dump && !sb_disasm) {
                         r600_bytecode_disasm(bc);
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 1cc30317ba5..ef2e2a2a117 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -915,14 +915,15 @@ unsigned r600_llvm_compile(
 	enum radeon_family family,
 	struct r600_bytecode *bc,
 	boolean *use_kill,
-	unsigned dump)
+	unsigned dump,
+	struct pipe_debug_callback *debug)
 {
 	unsigned r;
 	struct radeon_shader_binary binary;
 	const char * gpu_family = r600_get_llvm_processor_name(family);
 
 	memset(&binary, 0, sizeof(struct radeon_shader_binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL);
+	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL, debug);
 
 	r = r600_create_shader(bc, &binary, use_kill);
 
diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h
index 9b5304d9fcb..f570b739fbe 100644
--- a/src/gallium/drivers/r600/r600_llvm.h
+++ b/src/gallium/drivers/r600/r600_llvm.h
@@ -7,6 +7,7 @@
 #include "radeon/radeon_llvm.h"
 #include <llvm-c/Core.h>
 
+struct pipe_debug_callback;
 struct r600_bytecode;
 struct r600_shader_ctx;
 struct radeon_llvm_context;
@@ -22,7 +23,8 @@ unsigned r600_llvm_compile(
 	enum radeon_family family,
 	struct r600_bytecode *bc,
 	boolean *use_kill,
-	unsigned dump);
+	unsigned dump,
+	struct pipe_debug_callback *debug);
 
 unsigned r600_create_shader(struct r600_bytecode *bc,
 		const struct radeon_shader_binary *binary,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index d411b0be50e..9c040aeec4a 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -3259,7 +3259,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
 		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
 
-		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
+		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill,
+				      dump, &rctx->b.debug)) {
 			radeon_llvm_dispose(&radeon_llvm_ctx);
 			use_llvm = 0;
 			fprintf(stderr, "R600 LLVM backend failed to compile "
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 61ed9402122..9754fd95453 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -23,10 +23,12 @@
  * Authors: Tom Stellard <thomas.stellard@amd.com>
  *
  */
+
 #include "radeon_llvm_emit.h"
 #include "radeon_elf_util.h"
 #include "c11/threads.h"
 #include "gallivm/lp_bld_misc.h"
+#include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 
@@ -142,9 +144,9 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
 			     const char *gpu_family, bool dump_ir, bool dump_asm,
-			     LLVMTargetMachineRef tm)
+			     LLVMTargetMachineRef tm,
+			     struct pipe_debug_callback *debug)
 {
-
 	char cpu[CPU_STRING_LEN];
 	char fs[FS_STRING_LEN];
 	char *err;
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index e20aed94c6b..29e4dc05a3d 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -31,6 +31,7 @@
 #include <llvm-c/TargetMachine.h>
 #include <stdbool.h>
 
+struct pipe_debug_callback;
 struct radeon_shader_binary;
 
 void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
@@ -39,6 +40,7 @@ LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
 			     const char *gpu_family, bool dump_ir, bool dump_asm,
-			     LLVMTargetMachineRef tm);
+			     LLVMTargetMachineRef tm,
+			     struct pipe_debug_callback *debug);
 
 #endif /* RADEON_LLVM_EMIT_H */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a34f7da711d..270cc20ff10 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3934,7 +3934,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 
 	if (!si_replace_shader(count, &shader->binary)) {
 		r = radeon_llvm_compile(mod, &shader->binary,
-			r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm);
+			r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm,
+			debug);
 		if (r)
 			return r;
 	}

From 8f384d07a8b1cbaae0999aa9d0a91ba164fa2fe0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sat, 2 Jan 2016 16:40:47 -0500
Subject: [PATCH 042/241] gallium/radeon: send LLVM diagnostics as debug
 messages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Diagnostics sent during code generation and the error message reported by
LLVMTargetMachineEmitToMemoryBuffer are disjoint reporting mechanisms. We
take care of both and also send an explicit message indicating failure at the
end, so that log parsers can more easily tell the boundary between shader
compiles.

Removed an fprintf that could never be triggered.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeon/radeon_llvm_emit.c | 61 ++++++++++++++-----
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 9754fd95453..b765d367dab 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -31,6 +31,7 @@
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
 
 #include <llvm-c/Target.h>
 #include <llvm-c/TargetMachine.h>
@@ -125,16 +126,44 @@ LLVMTargetRef radeon_llvm_get_r600_target(const char *triple)
 	return target;
 }
 
+struct radeon_llvm_diagnostics {
+	struct pipe_debug_callback *debug;
+	unsigned retval;
+};
+
 static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
 {
-	if (LLVMGetDiagInfoSeverity(di) == LLVMDSError) {
-		unsigned int *diagnosticflag = (unsigned int *)context;
-		char *diaginfo_message = LLVMGetDiagInfoDescription(di);
+	struct radeon_llvm_diagnostics *diag = (struct radeon_llvm_diagnostics *)context;
+	LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
+	char *description = LLVMGetDiagInfoDescription(di);
+	const char *severity_str = NULL;
 
-		*diagnosticflag = 1;
-		fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", diaginfo_message);
-		LLVMDisposeMessage(diaginfo_message);
+	switch (severity) {
+	case LLVMDSError:
+		severity_str = "error";
+		break;
+	case LLVMDSWarning:
+		severity_str = "warning";
+		break;
+	case LLVMDSRemark:
+		severity_str = "remark";
+		break;
+	case LLVMDSNote:
+		severity_str = "note";
+		break;
+	default:
+		severity_str = "unknown";
 	}
+
+	pipe_debug_message(diag->debug, SHADER_INFO,
+			   "LLVM diagnostic (%s): %s", severity_str, description);
+
+	if (severity == LLVMDSError) {
+		diag->retval = 1;
+		fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description);
+	}
+
+	LLVMDisposeMessage(description);
 }
 
 /**
@@ -147,18 +176,21 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 			     LLVMTargetMachineRef tm,
 			     struct pipe_debug_callback *debug)
 {
+	struct radeon_llvm_diagnostics diag;
 	char cpu[CPU_STRING_LEN];
 	char fs[FS_STRING_LEN];
 	char *err;
 	bool dispose_tm = false;
 	LLVMContextRef llvm_ctx;
-	unsigned rval = 0;
 	LLVMMemoryBufferRef out_buffer;
 	unsigned buffer_size;
 	const char *buffer_data;
 	char triple[TRIPLE_STRING_LEN];
 	LLVMBool mem_err;
 
+	diag.debug = debug;
+	diag.retval = 0;
+
 	if (!tm) {
 		strncpy(triple, "r600--", TRIPLE_STRING_LEN);
 		LLVMTargetRef target = radeon_llvm_get_r600_target(triple);
@@ -179,8 +211,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	/* Setup Diagnostic Handler*/
 	llvm_ctx = LLVMGetModuleContext(M);
 
-	LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &rval);
-	rval = 0;
+	LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &diag);
 
 	/* Compile IR*/
 	mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile, &err,
@@ -189,15 +220,13 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	/* Process Errors/Warnings */
 	if (mem_err) {
 		fprintf(stderr, "%s: %s", __FUNCTION__, err);
+		pipe_debug_message(debug, SHADER_INFO,
+				   "LLVM emit error: %s", err);
 		FREE(err);
-		rval = 1;
+		diag.retval = 1;
 		goto out;
 	}
 
-	if (0 != rval) {
-		fprintf(stderr, "%s: Processing Diag Flag\n", __FUNCTION__);
-	}
-
 	/* Extract Shader Code*/
 	buffer_size = LLVMGetBufferSize(out_buffer);
 	buffer_data = LLVMGetBufferStart(out_buffer);
@@ -211,5 +240,7 @@ out:
 	if (dispose_tm) {
 		LLVMDisposeTargetMachine(tm);
 	}
-	return rval;
+	if (diag.retval != 0)
+		pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed");
+	return diag.retval;
 }

From 64253fdb2eda545a4988c02a60627a9840c79907 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 2 Jan 2016 17:33:19 -0800
Subject: [PATCH 043/241] vc4: Fix build from upload changes.

---
 src/gallium/drivers/vc4/vc4_resource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 308fb9fc77b..036da329987 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -921,7 +921,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
 
         void *data;
         struct pipe_resource *shadow_rsc = NULL;
-        u_upload_alloc(vc4->uploader, 0, count * 2, 4,;
+        u_upload_alloc(vc4->uploader, 0, count * 2, 4,
                        shadow_offset, &shadow_rsc, &data);
         uint16_t *dst = data;
 

From 53a9b6223f4ebf66e8892e04ffe47eb5586eda5c Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 31 Dec 2015 12:47:19 -0800
Subject: [PATCH 044/241] i965: Move 3-src subnr swizzle handling into the vec4
 backend.

While most align16 instructions only support a SubRegNum of 0 or 4
(using swizzling to control the other channels), 3-src instructions
actually support arbitrary SubRegNums.  When the RepCtrl bit is set,
we believe it ignores the swizzle and uses the equivalent of a <0,1,0>
region from the subnr.

In the past, we adopted a vec4-centric approach of specifying subnr of
0 or 4 and a swizzle, then having brw_eu_emit.c convert that to a proper
SubRegNum.  This isn't a great fit for the scalar backend, where we
don't set swizzles at all, and happily set subnrs in the range [0, 7].

This patch changes brw_eu_emit.c to use subnr and swizzle directly,
relying on the higher levels to set them sensibly.

This should fix problems where scalar sources get copy propagated into
3-src instructions in the FS backend.  I've only observed this with
TES push model inputs, but I suppose it could happen in other cases.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_eu_emit.c | 11 +++++------
 src/mesa/drivers/dri/i965/brw_vec4.cpp  | 13 +++++++++++++
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 5fb96626649..35d8039ed13 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -847,12 +847,11 @@ brw_alu2(struct brw_codegen *p, unsigned opcode,
 static int
 get_3src_subreg_nr(struct brw_reg reg)
 {
-   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
-      assert(brw_is_single_value_swizzle(reg.swizzle));
-      return reg.subnr / 4 + BRW_GET_SWZ(reg.swizzle, 0);
-   } else {
-      return reg.subnr / 4;
-   }
+   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
+    * use 32-bit units (components 0..7).  Since they only support F/D/UD
+    * types, this doesn't lose any flexibility, but uses fewer bits.
+    */
+   return reg.subnr / 4;
 }
 
 static brw_inst *
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index dd223985d1c..c6a52c5d183 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1784,9 +1784,22 @@ vec4_visitor::convert_to_hw_regs()
          case ATTR:
             unreachable("not reached");
          }
+
          src = reg;
       }
 
+      if (inst->is_3src()) {
+         /* 3-src instructions with scalar sources support arbitrary subnr,
+          * but don't actually use swizzles.  Convert swizzle into subnr.
+          */
+         for (int i = 0; i < 3; i++) {
+            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
+               assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
+               inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
+            }
+         }
+      }
+
       dst_reg &dst = inst->dst;
       struct brw_reg reg;
 

From b022150d70a1cfdda2007fa16b04c601eef45d6f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 27 Dec 2015 16:14:11 -0800
Subject: [PATCH 045/241] i965: Use LOAD_PAYLOAD for SIMD8 TES input loads, not
 MOV.

We need a MOV to replicate g0.0<0,1,0> to all 8 channels.  Since the
message payload is a single register, MOV seemed more sensible than
LOAD_PAYLOAD.  However, MOV cannot be CSE'd, while LOAD_PAYLOAD can.

All input loads can use the same header - we don't need to re-expand
g0 every time.  CSE accomplishes this, saving instructions.

shader-db statistics for files containing tessellation shaders:

total instructions in shared programs: 186923 -> 184358 (-1.37%)
instructions in affected programs: 30536 -> 27971 (-8.40%)
helped: 226
HURT: 0

total cycles in shared programs: 1009850 -> 1005356 (-0.45%)
cycles in affected programs: 168206 -> 163712 (-2.67%)
helped: 226
HURT: 0

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 827dbeeb7b6..788315f6c52 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1850,8 +1850,11 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
       fs_inst *inst;
       if (indirect_offset.file == BAD_FILE) {
          /* Replicate the patch handle to all enabled channels */
+         const fs_reg srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
+         };
          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld.MOV(patch_handle, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
 
          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
          inst->mlen = 1;

From 4a1c8a3037cd29938b2a6e2c680c341e9903cfbe Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 27 Dec 2015 17:26:30 -0800
Subject: [PATCH 046/241] i965: Push most TES inputs in SIMD8 mode.

Using the push model for inputs is much more efficient than pulling
inputs - the hardware can simply copy a large chunk into URB registers
at thread creation time, rather than having the thread send messages to
request data from the L3 cache.  Unfortunately, it's possible to have
more TES inputs than fit in registers, so we have to fall back to the
pull model in some cases.

However, it turns out that most tessellation evaluation shaders are
fairly simple, and don't use many inputs.  An arbitrary cut-off of
32 vec4 slots (16 registers) is more than sufficient to ensure that
100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven,
GPUTest/TessMark, and SynMark.

Note that unlike most SIMD8 stages, this actually reads packed vec4
data, since that is what our vec4 TCS programs write.

Improves performance in GPUTest's tessmark_x64 microbenchmark
by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768.

Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark
by 22.74% +/- 0.309394% (n = 5).

Improves performance in Shadow of Mordor at low settings with
tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4).

shader-db statistics for files containing tessellation shaders:

total instructions in shared programs: 184358 -> 181181 (-1.72%)
instructions in affected programs: 27971 -> 24794 (-11.36%)
helped: 226

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 40 +++++++++++++++++-------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 788315f6c52..ad347fcdbaf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1849,15 +1849,33 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
 
       fs_inst *inst;
       if (indirect_offset.file == BAD_FILE) {
-         /* Replicate the patch handle to all enabled channels */
-         const fs_reg srcs[] = {
-            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
-         };
-         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
+         /* Arbitrarily only push up to 32 vec4 slots worth of data,
+          * which is 16 registers (since each holds 2 vec4 slots).
+          */
+         const unsigned max_push_slots = 32;
+         if (imm_offset < max_push_slots) {
+            fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
+            for (int i = 0; i < instr->num_components; i++) {
+               bld.MOV(offset(dest, bld, i),
+                       component(src, 4 * (imm_offset % 2) + i));
+            }
+            tes_prog_data->base.urb_read_length =
+               MAX2(tes_prog_data->base.urb_read_length,
+                    DIV_ROUND_UP(imm_offset + 1, 2));
+         } else {
+            /* Replicate the patch handle to all enabled channels */
+            const fs_reg srcs[] = {
+               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
+            };
+            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
 
-         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
-         inst->mlen = 1;
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
+            inst->mlen = 1;
+            inst->offset = imm_offset;
+            inst->base_mrf = -1;
+            inst->regs_written = instr->num_components;
+         }
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
          const fs_reg srcs[] = {
@@ -1869,10 +1887,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
 
          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload);
          inst->mlen = 2;
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
       }
-      inst->offset = imm_offset;
-      inst->base_mrf = -1;
-      inst->regs_written = instr->num_components;
       break;
    }
    default:

From 28dea2662699072715f67ef34e910d278f88f3b1 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 1 Jan 2016 22:27:22 -0800
Subject: [PATCH 047/241] i965: Make TCS precompile use the TES primitive mode
 when available.

If there's a linked TES program, we should just use the actual
primitive mode.  If not, just guess triangles (as we did before).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tcs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 2c925e7f572..7e414260284 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -307,7 +307,9 @@ brw_tcs_precompile(struct gl_context *ctx,
    /* Guess that the input and output patches have the same dimensionality. */
    key.input_vertices = shader_prog->TessCtrl.VerticesOut;
 
-   key.tes_primitive_mode = GL_TRIANGLES;
+   key.tes_primitive_mode =
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] ?
+      shader_prog->TessEval.PrimitiveMode : GL_TRIANGLES;
 
    key.outputs_written = prog->OutputsWritten;
    key.patch_outputs_written = prog->PatchOutputsWritten;

From 53dddab78c9bc7fbfd78bf23284ec6d92b70e93b Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 9 Dec 2015 19:53:18 +0100
Subject: [PATCH 048/241] nv50,nvc0: optimize coherent buffer checking at draw
 time

Instead of iterating over all the buffer resources looking for coherent
buffers, we keep track of a context-wide count. This will save some
iterations (and CPU cycles) in 99.99% of cases, because coherent buffers
are rarely used.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nv50/nv50_context.h       |  3 ++
 src/gallium/drivers/nouveau/nv50/nv50_state.c | 25 +++++++++++
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c   | 42 ++++---------------
 .../drivers/nouveau/nvc0/nvc0_context.h       |  3 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 36 ++++++++++++++++
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   | 41 ++++--------------
 6 files changed, 82 insertions(+), 68 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 2cebcd99423..712d00ed2d3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -134,9 +134,11 @@ struct nv50_context {
    struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[3];
    uint16_t constbuf_valid[3];
+   uint16_t constbuf_coherent[3];
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -148,6 +150,7 @@ struct nv50_context {
 
    struct pipe_sampler_view *textures[3][PIPE_MAX_SAMPLERS];
    unsigned num_textures[3];
+   uint32_t textures_coherent[3];
    struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[3];
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index de655971b66..cb040439139 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -664,6 +664,17 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
       if (old)
          nv50_screen_tic_unlock(nv50->screen, old);
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nv50->textures_coherent[s] |= 1 << i;
+         else
+            nv50->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nv50->textures_coherent[s] &= ~(1 << i);
+      }
+
       pipe_sampler_view_reference(&nv50->textures[s][i], views[i]);
    }
 
@@ -847,13 +858,19 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nv50->constbuf[s][i].u.data = cb->user_buffer;
       nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (res) {
       nv50->constbuf[s][i].offset = cb->buffer_offset;
       nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nv50->constbuf_coherent[s] |= 1 << i;
+      else
+         nv50->constbuf_coherent[s] &= ~(1 << i);
    } else {
       nv50->constbuf_valid[s] &= ~(1 << i);
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    }
    nv50->constbuf_dirty[s] |= 1 << i;
 
@@ -1003,6 +1020,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    if (!vb) {
       nv50->vbo_user &= ~(((1ull << count) - 1) << start_slot);
       nv50->vbo_constant &= ~(((1ull << count) - 1) << start_slot);
+      nv50->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
       return;
    }
 
@@ -1015,9 +1033,16 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
             nv50->vbo_constant |= 1 << dst_index;
          else
             nv50->vbo_constant &= ~(1 << dst_index);
+         nv50->vtxbufs_coherent &= ~(1 << dst_index);
       } else {
          nv50->vbo_user &= ~(1 << dst_index);
          nv50->vbo_constant &= ~(1 << dst_index);
+
+         if (vb[i].buffer &&
+             vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+            nv50->vtxbufs_coherent |= (1 << dst_index);
+         else
+            nv50->vtxbufs_coherent &= ~(1 << dst_index);
       }
    }
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 2d1aa6abcd2..60fa2bc06a8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -765,7 +765,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    bool tex_dirty = false;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nv50->vb_elt_first = info->min_index + info->index_bias;
@@ -794,27 +794,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 3 && !nv50->cb_dirty; ++s) {
-      uint32_t valid = nv50->constbuf_valid[s];
-
-      while (valid && !nv50->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nv50->constbuf[s][i].user)
-            continue;
-
-         res = nv50->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nv50->cb_dirty = true;
-      }
+      if (nv50->constbuf_coherent[s])
+         nv50->cb_dirty = true;
    }
 
    /* If there are any coherent constbufs, flush the cache */
@@ -825,15 +807,10 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 3 && !tex_dirty; ++s) {
-      for (i = 0; i < nv50->num_textures[s] && !tex_dirty; ++i) {
-         if (!nv50->textures[s][i] ||
-             nv50->textures[s][i]->texture->target != PIPE_BUFFER)
-            continue;
-         if (nv50->textures[s][i]->texture->flags &
-             PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            tex_dirty = true;
-      }
+      if (nv50->textures_coherent[s])
+         tex_dirty = true;
    }
+
    if (tex_dirty) {
       BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
       PUSH_DATA (push, 0x20);
@@ -853,12 +830,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nv50->num_vtxbufs && !nv50->base.vbo_dirty; ++i) {
-      if (!nv50->vtxbuf[i].buffer)
-         continue;
-      if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv50->base.vbo_dirty = true;
-   }
+   nv50->base.vbo_dirty |= !!nv50->vtxbufs_coherent;
 
    if (nv50->base.vbo_dirty) {
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 39b73ecb0c2..12195489691 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -134,10 +134,12 @@ struct nvc0_context {
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
+   uint16_t constbuf_coherent[6];
    bool cb_dirty;
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t constant_vbos;
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -149,6 +151,7 @@ struct nvc0_context {
    struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS];
    unsigned num_textures[6];
    uint32_t textures_dirty[6];
+   uint32_t textures_coherent[6];
    struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[6];
    uint16_t samplers_dirty[6];
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 41a824a97a0..24a6c222dd5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -554,6 +554,17 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s,
          continue;
       nvc0->textures_dirty[s] |= 1 << i;
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nvc0->textures_coherent[s] |= 1 << i;
+         else
+            nvc0->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nvc0->textures_coherent[s] &= ~(1 << i);
+      }
+
       if (old) {
          nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i));
          nvc0_screen_tic_unlock(nvc0->screen, old);
@@ -596,6 +607,17 @@ nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s,
             continue;
          nvc0->textures_dirty[s] |= 1 << i;
 
+         if (views[p] && views[p]->texture) {
+            struct pipe_resource *res = views[p]->texture;
+            if (res->target == PIPE_BUFFER &&
+                (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+               nvc0->textures_coherent[s] |= 1 << i;
+            else
+               nvc0->textures_coherent[s] &= ~(1 << i);
+         } else {
+            nvc0->textures_coherent[s] &= ~(1 << i);
+         }
+
          if (nvc0->textures[s][i]) {
             struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
             nouveau_bufctx_reset(bctx, bin + i);
@@ -842,14 +864,20 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nvc0->constbuf[s][i].u.data = cb->user_buffer;
       nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (cb) {
       nvc0->constbuf[s][i].offset = cb->buffer_offset;
       nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      if (res && res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nvc0->constbuf_coherent[s] |= 1 << i;
+      else
+         nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
    else {
       nvc0->constbuf_valid[s] &= ~(1 << i);
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
 }
 
@@ -1009,6 +1037,7 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
     if (!vb) {
        nvc0->vbo_user &= ~(((1ull << count) - 1) << start_slot);
        nvc0->constant_vbos &= ~(((1ull << count) - 1) << start_slot);
+       nvc0->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
        return;
     }
 
@@ -1021,9 +1050,16 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
              nvc0->constant_vbos |= 1 << dst_index;
           else
              nvc0->constant_vbos &= ~(1 << dst_index);
+          nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        } else {
           nvc0->vbo_user &= ~(1 << dst_index);
           nvc0->constant_vbos &= ~(1 << dst_index);
+
+          if (vb[i].buffer &&
+              vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+             nvc0->vtxbufs_coherent |= (1 << dst_index);
+          else
+             nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        }
     }
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 235b1afc24b..251753357eb 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -871,7 +871,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nvc0->vb_elt_first = info->min_index + info->index_bias;
@@ -922,27 +922,9 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
    push->kick_notify = nvc0_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
-      uint32_t valid = nvc0->constbuf_valid[s];
-
-      while (valid && !nvc0->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nvc0->constbuf[s][i].user)
-            continue;
-
-         res = nvc0->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nvc0->cb_dirty = true;
-      }
+      if (nvc0->constbuf_coherent[s])
+         nvc0->cb_dirty = true;
    }
 
    if (nvc0->cb_dirty) {
@@ -951,14 +933,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 5; ++s) {
+      if (!nvc0->textures_coherent[s])
+         continue;
+
       for (int i = 0; i < nvc0->num_textures[s]; ++i) {
          struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
-         struct pipe_resource *res;
-         if (!tic)
-            continue;
-         res = nvc0->textures[s][i]->texture;
-         if (res->target != PIPE_BUFFER ||
-             !(res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+         if (!(nvc0->textures_coherent[s] & (1 << i)))
             continue;
 
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -984,12 +964,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nvc0->num_vtxbufs && !nvc0->base.vbo_dirty; ++i) {
-      if (!nvc0->vtxbuf[i].buffer)
-         continue;
-      if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nvc0->base.vbo_dirty = true;
-   }
+   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;
 
    if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer &&
        nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)

From 6a49fcfb1f28b563b89f2b37e82d9f87c0671228 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 25 Nov 2015 01:19:16 +0100
Subject: [PATCH 049/241] gallium/tests: fix build with clang compiler

Nested functions are supported as an extension in GNU C, but Clang
doesn't support them.

This fixes compilation errors when (manually) building compute.c,
or when passing --enable-gallium-tests to the configure script.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=75165
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/tests/trivial/compute.c | 603 +++++++++++++++-------------
 1 file changed, 330 insertions(+), 273 deletions(-)

diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c
index bcdfb11c4f1..5ce12abe227 100644
--- a/src/gallium/tests/trivial/compute.c
+++ b/src/gallium/tests/trivial/compute.c
@@ -428,6 +428,35 @@ static void launch_grid(struct context *ctx, const uint *block_layout,
         pipe->launch_grid(pipe, block_layout, grid_layout, pc, input);
 }
 
+static void test_default_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xdeadbeef;
+}
+
+/* test_system_values: expected-result callback for check_tex() */
+static void test_system_values_expect(void *p, int s, int x, int y)
+{
+        int id = x / 16, sv = (x % 16) / 4, c = x % 4;
+        int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
+        int bsz[] = { 4, 3, 5, 1};
+        int gsz[] = { 5, 4, 1, 1};
+
+        switch (sv) {
+        case 0:
+                *(uint32_t *)p = tid[c] / bsz[c];
+                break;
+        case 1:
+                *(uint32_t *)p = bsz[c];
+                break;
+        case 2:
+                *(uint32_t *)p = gsz[c];
+                break;
+        case 3:
+                *(uint32_t *)p = tid[c] % bsz[c];
+                break;
+        }
+}
+
 static void test_system_values(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -461,44 +490,31 @@ static void test_system_values(struct context *ctx)
                 "  STORE RES[0].xyzw, TEMP[0], SV[3]\n"
                 "  RET\n"
                 "ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                int id = x / 16, sv = (x % 16) / 4, c = x % 4;
-                int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
-                int bsz[] = { 4, 3, 5, 1};
-                int gsz[] = { 5, 4, 1, 1};
-
-                switch (sv) {
-                case 0:
-                        *(uint32_t *)p = tid[c] / bsz[c];
-                        break;
-                case 1:
-                        *(uint32_t *)p = bsz[c];
-                        break;
-                case 2:
-                        *(uint32_t *)p = gsz[c];
-                        break;
-                case 3:
-                        *(uint32_t *)p = tid[c] % bsz[c];
-                        break;
-                }
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 76800, 0, init);
+                 76800, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){4, 3, 5}, (uint []){5, 4, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_system_values_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_resource_access: init_tex()/check_tex() callbacks */
+static void test_resource_access_init0(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)x;
+}
+
+static void test_resource_access_expect(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)((x + 4 * y) & 0x3f);
+}
+
 static void test_resource_access(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -519,31 +535,33 @@ static void test_resource_access(struct context *ctx)
                 "       STORE RES[1].xyzw, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init0(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)x;
-        }
-        void init1(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)((x + 4*y) & 0x3f);
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init0);
+                 256, 0, test_resource_access_init0);
         init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 60, 12, init1);
+                 60, 12, test_default_init);
         init_compute_resources(ctx, (int []) { 0, 1, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){15, 12, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_resource_access_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_function_calls: init_tex()/check_tex() callbacks */
+static void test_function_calls_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 15 * y + x;
+}
+
+static void test_function_calls_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
+}
+
 static void test_function_calls(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -585,26 +603,26 @@ static void test_function_calls(struct context *ctx)
                 "21:  STORE RES[0].x, TEMP[2], TEMP[1].xxxx\n"
                 "22:  RET\n"
                 "23: ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 15 * y + x;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 15, 12, init);
+                 15, 12, test_function_calls_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){3, 3, 3}, (uint []){5, 4, 1}, 15, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_function_calls_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_input_global: expected-result callback for check_tex() */
+static void test_input_global_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
+}
+
 static void test_input_global(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -621,35 +639,39 @@ static void test_input_global(struct context *ctx)
                 "       STORE RGLOBAL.x, TEMP[1].yyyy, TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
-        }
         uint32_t input[8] = { 0x10001, 0x10002, 0x10003, 0x10004,
                               0x10005, 0x10006, 0x10007, 0x10008 };
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 32, src, NULL);
-        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
         init_globals(ctx, (int []){ 0, 1, 2, 3, -1 },
                      (uint32_t *[]){ &input[1], &input[3],
                                      &input[5], &input[7] });
         launch_grid(ctx, (uint []){4, 1, 1}, (uint []){1, 1, 1}, 0, input);
-        check_tex(ctx, 0, expect, NULL);
-        check_tex(ctx, 1, expect, NULL);
-        check_tex(ctx, 2, expect, NULL);
-        check_tex(ctx, 3, expect, NULL);
+        check_tex(ctx, 0, test_input_global_expect, NULL);
+        check_tex(ctx, 1, test_input_global_expect, NULL);
+        check_tex(ctx, 2, test_input_global_expect, NULL);
+        check_tex(ctx, 3, test_input_global_expect, NULL);
         destroy_globals(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_private: expected-result callback for check_tex() */
+static void test_private_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = (x / 32) + x % 32;
+}
+
 static void test_private(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -691,26 +713,26 @@ static void test_private(struct context *ctx)
                 "       ENDLOOP\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = (x / 32) + x % 32;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 128, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 32768, 0, init);
+                 32768, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){16, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_private_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_local: expected-result callback for check_tex() */
+static void test_local_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x & 0x20 ? 2 : 1;
+}
+
 static void test_local(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -778,26 +800,42 @@ static void test_local(struct context *ctx)
                 "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x & 0x20 ? 2 : 1;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 256, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_local_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_sample: init_tex()/check_tex() callbacks */
+static void test_sample_init(void *p, int s, int x, int y)
+{
+        *(float *)p = s ? 1 : x * y;
+}
+
+static void test_sample_expect(void *p, int s, int x, int y)
+{
+        switch (x % 4) {
+        case 0:
+                *(float *)p = x / 4 * y;
+                break;
+        case 1:
+        case 2:
+                *(float *)p = 0;
+                break;
+        case 3:
+                *(float *)p = 1;
+                break;
+        }
+}
+
 static void test_sample(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -818,36 +856,19 @@ static void test_sample(struct context *ctx)
                 "       STORE RES[0].xyzw, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(float *)p = s ? 1 : x * y;
-        }
-        void expect(void *p, int s, int x, int y) {
-                switch (x % 4) {
-                case 0:
-                        *(float *)p = x / 4 * y;
-                        break;
-                case 1:
-                case 2:
-                        *(float *)p = 0;
-                        break;
-                case 3:
-                        *(float *)p = 1;
-                        break;
-                }
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 128, 32, init);
+                 128, 32, test_sample_init);
         init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 512, 32, init);
+                 512, 32, test_sample_init);
         init_compute_resources(ctx, (int []) { 1, -1 });
         init_sampler_views(ctx, (int []) { 0, -1 });
         init_sampler_states(ctx, 2);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_sample_expect, NULL);
         destroy_sampler_states(ctx);
         destroy_sampler_views(ctx);
         destroy_compute_resources(ctx);
@@ -855,6 +876,12 @@ static void test_sample(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_many_kern: expected-result callback for check_tex() */
+static void test_many_kern_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x;
+}
+
 static void test_many_kern(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -883,29 +910,34 @@ static void test_many_kern(struct context *ctx)
                 "       STORE RES[0].x, TEMP[0], IMM[0].wwww\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 16, 0, init);
+                 16, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 5, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 10, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 15, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_many_kern_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_constant: init_tex()/check_tex() callbacks */
+static void test_constant_init(void *p, int s, int x, int y)
+{
+        *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;
+}
+
+static void test_constant_expect(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)x;
+}
+
 static void test_constant(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -922,28 +954,36 @@ static void test_constant(struct context *ctx)
                 "       STORE RES[1].x, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)x;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_constant_init);
         init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_constant_init);
         init_compute_resources(ctx, (int []) { 0, 1, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_constant_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_resource_indirect: init_tex()/check_tex() callbacks */
+static void test_resource_indirect_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = s == 0 ? 0xdeadbeef :
+                s == 1 ? x % 2 :
+                s == 2 ? 2 * x :
+                2 * x + 1;
+}
+
+static void test_resource_indirect_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);
+}
+
 static void test_resource_indirect(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -961,35 +1001,27 @@ static void test_resource_indirect(struct context *ctx)
                 "       STORE RES[0].x, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = s == 0 ? 0xdeadbeef :
-                   s == 1 ? x % 2 :
-                   s == 2 ? 2 * x :
-                   2 * x + 1;
-        }
-        void expect(void *p, int s, int x, int y) {
-           *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 1, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 2, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 3, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_compute_resources(ctx, (int []) { 0, 1, 2, 3, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_resource_indirect_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_surface_ld: format table (also read by test_surface_st helpers) */
 enum pipe_format surface_fmts[] = {
         PIPE_FORMAT_B8G8R8A8_UNORM,
         PIPE_FORMAT_B8G8R8X8_UNORM,
@@ -1023,6 +1055,42 @@ enum pipe_format surface_fmts[] = {
         PIPE_FORMAT_R32G32B32A32_SINT
 };
 
+static void test_surface_ld_init0f(void *p, int s, int x, int y)
+{
+        float v[] = { 1.0, -.75, .50, -.25 };
+        int i = 0;
+        /* NOTE(review): the nested original used the caller's i, not 0. */
+        util_format_write_4f(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_ld_init0i(void *p, int s, int x, int y)
+{
+        int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+        int i = 0;
+        /* NOTE(review): the nested original used the caller's i, not 0. */
+        util_format_write_4i(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_ld_expectf(void *p, int s, int x, int y)
+{
+        float v[4], w[4];
+        int i = 0;
+        /* NOTE(review): the nested original used the caller's i, not 0. */
+        test_surface_ld_init0f(v, s, x / 4, y);
+        util_format_read_4f(surface_fmts[i], w, 0, v, 0, 0, 0, 1, 1);
+        *(float *)p = w[x % 4];
+}
+
+static void test_surface_ld_expecti(void *p, int s, int x, int y)
+{
+        int32_t v[4], w[4];
+        int i = 0;
+        /* NOTE(review): the nested original used the caller's i, not 0. */
+        test_surface_ld_init0i(v, s, x / 4, y);
+        util_format_read_4i(surface_fmts[i], w, 0, v, 0, 0, 0, 1, 1);
+        *(uint32_t *)p = w[x % 4];
+}
+
 static void test_surface_ld(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1040,33 +1108,6 @@ static void test_surface_ld(struct context *ctx)
                 "       RET\n"
                 "    ENDSUB\n";
         int i = 0;
-        void init0f(void *p, int s, int x, int y) {
-                float v[] = { 1.0, -.75, .50, -.25 };
-                util_format_write_4f(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void init0i(void *p, int s, int x, int y) {
-                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
-                util_format_write_4i(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void init1(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expectf(void *p, int s, int x, int y) {
-                float v[4], w[4];
-                init0f(v, s, x / 4, y);
-                util_format_read_4f(surface_fmts[i], w, 0,
-                                    v, 0, 0, 0, 1, 1);
-                *(float *)p = w[x % 4];
-        }
-        void expecti(void *p, int s, int x, int y) {
-                int32_t v[4], w[4];
-                init0i(v, s, x / 4, y);
-                util_format_read_4i(surface_fmts[i], w, 0,
-                                    v, 0, 0, 0, 1, 1);
-                *(uint32_t *)p = w[x % 4];
-        }
 
         printf("- %s\n", __func__);
 
@@ -1085,14 +1126,14 @@ static void test_surface_ld(struct context *ctx)
                 }
 
                 init_tex(ctx, 0, PIPE_TEXTURE_2D, true, surface_fmts[i],
-                         128, 32, (is_int ? init0i : init0f));
+                         128, 32, (is_int ? test_surface_ld_init0i : test_surface_ld_init0f));
                 init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                         512, 32, init1);
+                         512, 32, test_default_init);
                 init_compute_resources(ctx, (int []) { 0, 1, -1 });
                 init_sampler_states(ctx, 2);
                 launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
                             NULL);
-                check_tex(ctx, 1, (is_int ? expecti : expectf), NULL);
+                check_tex(ctx, 1, (is_int ? test_surface_ld_expecti : test_surface_ld_expectf), NULL);
                 destroy_sampler_states(ctx);
                 destroy_compute_resources(ctx);
                 destroy_tex(ctx);
@@ -1101,6 +1142,73 @@ static void test_surface_ld(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_surface_st: callbacks handed to init_tex()/check_tex() below */
+static void test_surface_st_init0f(void *p, int s, int x, int y)
+{
+        float v[] = { 1.0, -.75, 0.5, -.25 };
+        *(float *)p = v[x % 4]; /* one float per call, picked by x % 4; s and y are unused */
+}
+
+static void test_surface_st_init0i(void *p, int s, int x, int y)
+{
+        int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+        *(int32_t *)p = v[x % 4]; /* integer twin of init0f: one 32-bit word picked by x % 4 */
+}
+
+static void test_surface_st_init1(void *p, int s, int x, int y)
+{
+        int i = 0; /* NOTE(review): the former nested version read the caller's i; this always uses surface_fmts[0] -- confirm */
+        memset(p, 1, util_format_get_blocksize(surface_fmts[i])); /* fills util_format_get_blocksize() bytes with 0x01 */
+}
+
+static void test_surface_st_expectf(void *p, int s, int x, int y)
+{
+        float vf[4];
+        int i = 0, j; /* NOTE(review): the former nested version read the caller's i; this always uses surface_fmts[0] -- confirm */
+
+        for (j = 0; j < 4; j++) /* regenerate the four float inputs at columns 4*x .. 4*x+3 */
+                test_surface_st_init0f(&vf[j], s, 4 * x + j, y);
+        util_format_write_4f(surface_fmts[i], vf, 0, p, 0, 0, 0, 1, 1); /* presumably packs vf into p as surface_fmts[i]; helper not visible here */
+}
+
+static void test_surface_st_expects(void *p, int s, int x, int y)
+{
+        int32_t v[4];
+        int i = 0, j; /* NOTE(review): the former nested version read the caller's i; this always uses surface_fmts[0] -- confirm */
+
+        for (j = 0; j < 4; j++) /* regenerate the four integer inputs at columns 4*x .. 4*x+3 */
+                test_surface_st_init0i(&v[j], s, 4 * x + j, y);
+        util_format_write_4i(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1); /* signed variant; presumably packs v into p as surface_fmts[i] */
+}
+
+static void test_surface_st_expectu(void *p, int s, int x, int y)
+{
+        uint32_t v[4];
+        int i = 0, j; /* NOTE(review): the former nested version read the caller's i; this always uses surface_fmts[0] -- confirm */
+
+        for (j = 0; j < 4; j++) /* same inputs as the signed variant, stored into uint32_t slots */
+                test_surface_st_init0i(&v[j], s, 4 * x + j, y);
+        util_format_write_4ui(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1); /* unsigned variant; presumably packs v into p as surface_fmts[i] */
+}
+
+static bool test_surface_st_check(void *x, void *y, int sz)
+{
+        int i = 0, j; /* NOTE(review): the former nested version read the caller's i; this always uses surface_fmts[0] -- confirm */
+
+        if (util_format_is_float(surface_fmts[i])) { /* float format: compare the first float with a 3.92156863e-3 tolerance */
+                return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;
+        } else if ((sz % 4) == 0) { /* whole 32-bit words: each word may be off by one */
+                for (j = 0; j < sz / 4; j++) {
+                        uint32_t d = ((uint32_t *)x)[j] - ((uint32_t *)y)[j]; /* wraps mod 2^32; replaces abs() on an unsigned value */
+                        if (d > 1 && d != (uint32_t)-1) /* difference is none of 0, +1, -1 */
+                                return false;
+                }
+                return true;
+        } else { /* any other size must match byte for byte */
+                return !memcmp(x, y, sz);
+        }
+}
+
 static void test_surface_st(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1118,60 +1226,6 @@ static void test_surface_st(struct context *ctx)
                 "       RET\n"
                 "    ENDSUB\n";
         int i = 0;
-        void init0f(void *p, int s, int x, int y) {
-                float v[] = { 1.0, -.75, 0.5, -.25 };
-                *(float *)p = v[x % 4];
-        }
-        void init0i(void *p, int s, int x, int y) {
-                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
-                *(int32_t *)p = v[x % 4];
-        }
-        void init1(void *p, int s, int x, int y) {
-                memset(p, 1, util_format_get_blocksize(surface_fmts[i]));
-        }
-        void expectf(void *p, int s, int x, int y) {
-                float vf[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0f(&vf[j], s, 4 * x + j, y);
-                util_format_write_4f(surface_fmts[i], vf, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void expects(void *p, int s, int x, int y) {
-                int32_t v[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0i(&v[j], s, 4 * x + j, y);
-                util_format_write_4i(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void expectu(void *p, int s, int x, int y) {
-                uint32_t v[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0i(&v[j], s, 4 * x + j, y);
-                util_format_write_4ui(surface_fmts[i], v, 0,
-                                      p, 0, 0, 0, 1, 1);
-        }
-        bool check(void *x, void *y, int sz) {
-                int j;
-
-                if (util_format_is_float(surface_fmts[i])) {
-                        return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;
-
-                } else if ((sz % 4) == 0) {
-                        for (j = 0; j < sz / 4; j++)
-                                if (abs(((uint32_t *)x)[j] -
-                                        ((uint32_t *)y)[j]) > 1)
-                                        return false;
-                        return true;
-                } else {
-                        return !memcmp(x, y, sz);
-                }
-        }
 
         printf("- %s\n", __func__);
 
@@ -1192,16 +1246,16 @@ static void test_surface_st(struct context *ctx)
                 }
 
                 init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                         512, 32, (is_int ? init0i : init0f));
+                         512, 32, (is_int ? test_surface_st_init0i : test_surface_st_init0f));
                 init_tex(ctx, 1, PIPE_TEXTURE_2D, true, surface_fmts[i],
-                         128, 32, init1);
+                         128, 32, test_surface_st_init1);
                 init_compute_resources(ctx, (int []) { 0, 1, -1 });
                 init_sampler_states(ctx, 2);
                 launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
                             NULL);
-                check_tex(ctx, 1, (is_int && is_signed ? expects :
-                                   is_int && !is_signed ? expectu :
-                                   expectf), check);
+                check_tex(ctx, 1, (is_int && is_signed ? test_surface_st_expects :
+                                   is_int && !is_signed ? test_surface_st_expectu :
+                                   test_surface_st_expectf), test_surface_st_check);
                 destroy_sampler_states(ctx);
                 destroy_compute_resources(ctx);
                 destroy_tex(ctx);
@@ -1210,6 +1264,12 @@ static void test_surface_st(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_barrier: check_tex() callback; the expected value is 31 at every position */
+static void test_barrier_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 31;
+}
+
 static void test_barrier(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1259,26 +1319,62 @@ static void test_barrier(struct context *ctx)
                 "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 31;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 256, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_barrier_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_atom_ops: callbacks handed to init_tex()/check_tex() below */
+static void test_atom_ops_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xbad; /* same seed word at every position */
+}
+
+static void test_atom_ops_expect(void *p, int s, int x, int y)
+{
+        switch (x) { /* expected word for x = 0..9; any other x leaves *p unwritten (no default) */
+        case 0:
+                *(uint32_t *)p = 0xce6c8eef;
+                break;
+        case 1:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 2:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 3:
+                *(uint32_t *)p = 0x10011001;
+                break;
+        case 4:
+                *(uint32_t *)p = 0xdfbdbfff;
+                break;
+        case 5:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 6:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 7:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 8:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 9:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        }
+}
+
 static void test_atom_ops(struct context *ctx, bool global)
 {
         const char *src = "COMP\n"
@@ -1381,58 +1477,26 @@ static void test_atom_ops(struct context *ctx, bool global)
                 "       RET\n"
                 "    ENDSUB\n";
 
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xbad;
-        }
-        void expect(void *p, int s, int x, int y) {
-                switch (x) {
-                case 0:
-                        *(uint32_t *)p = 0xce6c8eef;
-                        break;
-                case 1:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 2:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 3:
-                        *(uint32_t *)p = 0x10011001;
-                        break;
-                case 4:
-                        *(uint32_t *)p = 0xdfbdbfff;
-                        break;
-                case 5:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 6:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 7:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 8:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 9:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                }
-        }
-
         printf("- %s (%s)\n", __func__, global ? "global" : "local");
 
         init_prog(ctx, 40, 0, 0, src,
                   (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 40, 0, init);
+                 40, 0, test_atom_ops_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){10, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_atom_ops_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_atom_race: check_tex() callback */
+static void test_atom_race_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff; /* 0x11111111 where bit 5 of x is set, else 0xffffffff */
+}
+
 static void test_atom_race(struct context *ctx, bool global)
 {
         const char *src = "COMP\n"
@@ -1551,22 +1615,15 @@ static void test_atom_race(struct context *ctx, bool global)
                 "       RET\n"
                 "    ENDSUB\n";
 
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff;
-        }
-
         printf("- %s (%s)\n", __func__, global ? "global" : "local");
 
         init_prog(ctx, 256, 0, 0, src,
                   (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_atom_race_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);

From ab4efb19dc4dd5f4e5822f90178f0edba1c4095a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 21 Dec 2015 10:21:29 -0500
Subject: [PATCH 050/241] freedreno/ir3: drop unnecessary unreachable() case

It will still hit a compile_assert() in emit_tex, which has the
advantage of dumping out the offending shader.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 224f7806b3c..bb2221d17d3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1954,8 +1954,6 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		case nir_texop_query_levels:
 			emit_tex_query_levels(ctx, tex);
 			break;
-		case nir_texop_samples_identical:
-			unreachable("nir_texop_samples_identical");
 		default:
 			emit_tex(ctx, tex);
 			break;

From 74135f804a4f18040a0a62664df67d35c8090d1d Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 24 Oct 2015 14:30:31 -0400
Subject: [PATCH 051/241] freedreno/ir3: refactor NIR IR handling

Immediately convert into NIR and do an initial key-agnostic lowering/
optimization pass.  This should let us share most of the per-variant
transformations between each variant, and hopefully minimize the draw-
time variant creation part of the compilation process.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/Makefile.sources        |   1 +
 .../drivers/freedreno/ir3/ir3_cmdline.c       |  15 +-
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 118 +++-----------
 src/gallium/drivers/freedreno/ir3/ir3_nir.c   | 144 ++++++++++++++++++
 src/gallium/drivers/freedreno/ir3/ir3_nir.h   |   7 +
 .../drivers/freedreno/ir3/ir3_shader.c        |  24 +--
 .../drivers/freedreno/ir3/ir3_shader.h        |   4 +-
 7 files changed, 202 insertions(+), 111 deletions(-)
 create mode 100644 src/gallium/drivers/freedreno/ir3/ir3_nir.c

diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index baae9144005..74ef4168655 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -128,6 +128,7 @@ ir3_SOURCES := \
 	ir3/ir3_group.c \
 	ir3/ir3.h \
 	ir3/ir3_legalize.c \
+	ir3/ir3_nir.c \
 	ir3/ir3_nir.h \
 	ir3/ir3_nir_lower_if_else.c \
 	ir3/ir3_print.c \
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d55daeefe06..481859efb17 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -40,6 +40,7 @@
 #include "freedreno_util.h"
 
 #include "ir3_compiler.h"
+#include "ir3_nir.h"
 #include "instr-a3xx.h"
 #include "ir3.h"
 
@@ -105,10 +106,10 @@ int main(int argc, char **argv)
 	const char *filename;
 	struct tgsi_token toks[65536];
 	struct tgsi_parse_context parse;
-	struct ir3_compiler *compiler;
 	struct ir3_shader_variant v;
 	struct ir3_shader s;
 	struct ir3_shader_key key = {};
+	/* TODO cmdline option to target different gpus: */
 	unsigned gpu_id = 320;
 	const char *info;
 	void *ptr;
@@ -228,7 +229,12 @@ int main(int argc, char **argv)
 	if (!tgsi_text_translate(ptr, toks, Elements(toks)))
 		errx(1, "could not parse `%s'", filename);
 
-	s.tokens = toks;
+	if (fd_mesa_debug & FD_DBG_OPTMSGS)
+		tgsi_dump(toks, 0);
+
+	nir_shader *nir = ir3_tgsi_to_nir(toks);
+	s.compiler = ir3_compiler_create(gpu_id);
+	s.nir = ir3_optimize_nir(&s, nir, NULL);
 
 	v.key = key;
 	v.shader = &s;
@@ -246,11 +252,8 @@ int main(int argc, char **argv)
 		break;
 	}
 
-	/* TODO cmdline option to target different gpus: */
-	compiler = ir3_compiler_create(gpu_id);
-
 	info = "NIR compiler";
-	ret = ir3_compile_shader_nir(compiler, &v);
+	ret = ir3_compile_shader_nir(s.compiler, &v);
 	if (ret) {
 		fprintf(stderr, "compiler failed!\n");
 		return ret;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index bb2221d17d3..0a25d5252a1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -32,10 +32,6 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_strings.h"
-
-#include "nir/tgsi_to_nir.h"
 
 #include "freedreno_util.h"
 
@@ -123,97 +119,10 @@ struct ir3_compile {
 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
 
-static struct nir_shader *to_nir(struct ir3_compile *ctx,
-		const struct tgsi_token *tokens, struct ir3_shader_variant *so)
-{
-	static const nir_shader_compiler_options options = {
-			.lower_fpow = true,
-			.lower_fsat = true,
-			.lower_scmp = true,
-			.lower_flrp = true,
-			.lower_ffract = true,
-			.native_integers = true,
-	};
-	struct nir_lower_tex_options tex_options = {
-			.lower_rect = 0,
-	};
-	bool progress;
-
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		tex_options.saturate_s = so->key.fsaturate_s;
-		tex_options.saturate_t = so->key.fsaturate_t;
-		tex_options.saturate_r = so->key.fsaturate_r;
-		break;
-	case SHADER_VERTEX:
-		tex_options.saturate_s = so->key.vsaturate_s;
-		tex_options.saturate_t = so->key.vsaturate_t;
-		tex_options.saturate_r = so->key.vsaturate_r;
-		break;
-	}
-
-	if (ctx->compiler->gpu_id >= 400) {
-		/* a4xx seems to have *no* sam.p */
-		tex_options.lower_txp = ~0;  /* lower all txp */
-	} else {
-		/* a3xx just needs to avoid sam.p for 3d tex */
-		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-	}
-
-	struct nir_shader *s = tgsi_to_nir(tokens, &options);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	nir_opt_global_to_local(s);
-	nir_convert_to_ssa(s);
-	if (s->stage == MESA_SHADER_VERTEX) {
-		nir_lower_clip_vs(s, so->key.ucp_enables);
-	} else if (s->stage == MESA_SHADER_FRAGMENT) {
-		nir_lower_clip_fs(s, so->key.ucp_enables);
-	}
-	nir_lower_tex(s, &tex_options);
-	if (so->key.color_two_side)
-		nir_lower_two_sided_color(s);
-	nir_lower_idiv(s);
-	nir_lower_load_const_to_scalar(s);
-
-	do {
-		progress = false;
-
-		nir_lower_vars_to_ssa(s);
-		nir_lower_alu_to_scalar(s);
-		nir_lower_phis_to_scalar(s);
-
-		progress |= nir_copy_prop(s);
-		progress |= nir_opt_dce(s);
-		progress |= nir_opt_cse(s);
-		progress |= ir3_nir_lower_if_else(s);
-		progress |= nir_opt_algebraic(s);
-		progress |= nir_opt_constant_folding(s);
-
-	} while (progress);
-
-	nir_remove_dead_variables(s);
-	nir_validate_shader(s);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	return s;
-}
 
 static struct ir3_compile *
 compile_init(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens)
+		struct ir3_shader_variant *so)
 {
 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 
@@ -239,7 +148,28 @@ compile_init(struct ir3_compiler *compiler,
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
-	ctx->s = to_nir(ctx, tokens, so);
+	/* TODO: maybe generate some sort of bitmask of what key
+	 * lowers vs what shader has (ie. no need to do
+	 * texture clamp lowering if no texture sample instrs)..
+	 * although should be done further up the stack to avoid
+	 * creating duplicate variants..
+	 */
+
+	if (ir3_key_lowers_nir(&so->key)) {
+		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+	} else {
+		/* fast-path for shader key that lowers nothing in NIR: */
+		ctx->s = so->shader->nir;
+	}
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
+			so->shader->id, so->id, so->type,
+			so->key.binning_pass, so->key.color_two_side,
+			so->key.half_precision);
+		nir_print_shader(ctx->s, stdout);
+	}
 
 	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 
@@ -2497,7 +2427,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	assert(!so->ir);
 
-	ctx = compile_init(compiler, so, so->shader->tokens);
+	ctx = compile_init(compiler, so);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
new file mode 100644
index 00000000000..4d83ee6a987
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -0,0 +1,144 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+
+#include "freedreno_util.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "nir/tgsi_to_nir.h"
+
+struct nir_shader *
+ir3_tgsi_to_nir(const struct tgsi_token *tokens)
+{
+	static const nir_shader_compiler_options options = { /* same option set on every call */
+			.lower_fpow = true,
+			.lower_fsat = true,
+			.lower_scmp = true,
+			.lower_flrp = true,
+			.lower_ffract = true,
+			.native_integers = true,
+	};
+	return tgsi_to_nir(tokens, &options); /* the translation itself is done by tgsi_to_nir() */
+}
+
+/* true if any key field read by ir3_optimize_nir() (saturate_*, ucp_enables, color_two_side) is set */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+	return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
+			key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
+			key->ucp_enables | key->color_two_side;
+}
+
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key)
+{
+	struct nir_lower_tex_options tex_options = {
+			.lower_rect = 0,
+	};
+	bool progress;
+
+	if (key) { /* NULL key (key-agnostic pass): the saturate_* options stay zero */
+		switch (shader->type) {
+		case SHADER_FRAGMENT:
+		case SHADER_COMPUTE:
+			tex_options.saturate_s = key->fsaturate_s;
+			tex_options.saturate_t = key->fsaturate_t;
+			tex_options.saturate_r = key->fsaturate_r;
+			break;
+		case SHADER_VERTEX:
+			tex_options.saturate_s = key->vsaturate_s;
+			tex_options.saturate_t = key->vsaturate_t;
+			tex_options.saturate_r = key->vsaturate_r;
+			break;
+		}
+	}
+
+	if (shader->compiler->gpu_id >= 400) {
+		/* a4xx seems to have *no* sam.p */
+		tex_options.lower_txp = ~0;  /* lower all txp */
+	} else {
+		/* a3xx just needs to avoid sam.p for 3d tex */
+		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+	}
+
+	if (fd_mesa_debug & FD_DBG_DISASM) { /* dump the shader as received */
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	nir_opt_global_to_local(s);
+	nir_convert_to_ssa(s);
+	if (key) { /* clip-plane and two-sided-color lowering only run when a key is given */
+		if (s->stage == MESA_SHADER_VERTEX) {
+			nir_lower_clip_vs(s, key->ucp_enables);
+		} else if (s->stage == MESA_SHADER_FRAGMENT) {
+			nir_lower_clip_fs(s, key->ucp_enables);
+		}
+		if (key->color_two_side) {
+			nir_lower_two_sided_color(s);
+		}
+	}
+	nir_lower_tex(s, &tex_options);
+	nir_lower_idiv(s);
+	nir_lower_load_const_to_scalar(s);
+
+	do { /* rerun the passes below until none of them reports progress */
+		progress = false;
+
+		nir_lower_vars_to_ssa(s);
+		nir_lower_alu_to_scalar(s);
+		nir_lower_phis_to_scalar(s);
+
+		progress |= nir_copy_prop(s);
+		progress |= nir_opt_dce(s);
+		progress |= nir_opt_cse(s);
+		progress |= ir3_nir_lower_if_else(s);
+		progress |= nir_opt_algebraic(s);
+		progress |= nir_opt_constant_folding(s);
+
+	} while (progress);
+
+	nir_remove_dead_variables(s);
+	nir_validate_shader(s);
+
+	if (fd_mesa_debug & FD_DBG_DISASM) { /* dump again after the passes above */
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	nir_sweep(s);
+
+	return s; /* the same shader object that was passed in */
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
index 9950782dc38..534199d3744 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
@@ -32,6 +32,13 @@
 #include "glsl/nir/nir.h"
 #include "glsl/nir/shader_enums.h"
 
+#include "ir3_shader.h"
+
 bool ir3_nir_lower_if_else(nir_shader *shader);
 
+struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens);
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key);
+
 #endif /* IR3_NIR_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 7b565332256..7d17f426ad3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -39,7 +39,7 @@
 
 #include "ir3_shader.h"
 #include "ir3_compiler.h"
-
+#include "ir3_nir.h"
 
 static void
 delete_variant(struct ir3_shader_variant *v)
@@ -187,12 +187,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	v->key = key;
 	v->type = shader->type;
 
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
-			key.binning_pass, key.color_two_side, key.half_precision);
-		tgsi_dump(shader->tokens, 0);
-	}
-
 	ret = ir3_compile_shader_nir(shader->compiler, v);
 	if (ret) {
 		debug_error("compile failed!");
@@ -267,7 +261,7 @@ ir3_shader_destroy(struct ir3_shader *shader)
 		v = v->next;
 		delete_variant(t);
 	}
-	free((void *)shader->tokens);
+	ralloc_free(shader->nir);
 	free(shader);
 }
 
@@ -281,14 +275,24 @@ ir3_shader_create(struct pipe_context *pctx,
 	shader->id = ++shader->compiler->shader_count;
 	shader->pctx = pctx;
 	shader->type = type;
-	shader->tokens = tgsi_dup_tokens(cso->tokens);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump tgsi: type=%d", shader->type);
+		tgsi_dump(cso->tokens, 0);
+	}
+	nir_shader *nir = ir3_tgsi_to_nir(cso->tokens);
+	/* do first pass optimization, ignoring the key: */
+	shader->nir = ir3_optimize_nir(shader, nir, NULL);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%d: type=%d", shader->id, shader->type);
+		nir_print_shader(shader->nir, stdout);
+	}
 	shader->stream_output = cso->stream_output;
 	if (fd_mesa_debug & FD_DBG_SHADERDB) {
 		/* if shader-db run, create a standard variant immediately
 		 * (as otherwise nothing will trigger the shader to be
 		 * actually compiled)
 		 */
-		static struct ir3_shader_key key = {};
+		static struct ir3_shader_key key = {0};
 		ir3_shader_variant(shader, key);
 	}
 	return shader;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index cf99a4c05ed..b3c28a41387 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -230,6 +230,8 @@ struct ir3_shader_variant {
 	struct ir3_shader *shader;
 };
 
+typedef struct nir_shader nir_shader;
+
 struct ir3_shader {
 	enum shader_t type;
 
@@ -240,7 +242,7 @@ struct ir3_shader {
 	struct ir3_compiler *compiler;
 
 	struct pipe_context *pctx;    /* TODO replace w/ pipe_screen */
-	const struct tgsi_token *tokens;
+	nir_shader *nir;
 	struct pipe_stream_output_info stream_output;
 
 	struct ir3_shader_variant *variants;

From 23bd6affb24662e9e8dbe1ed353babd17b5a016d Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 26 Oct 2015 10:50:35 -0400
Subject: [PATCH 052/241] freedreno/ir3: we require block_index metadata

Found during NIR_TEST_CLONE=1 piglit run.  We were using block->index
but forgetting to require it, causing things to not work with a cloned
shader which didn't preserve block_index.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 0a25d5252a1..86afda4ba08 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -2098,6 +2098,8 @@ emit_stream_out(struct ir3_compile *ctx)
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
+	nir_metadata_require(impl, nir_metadata_block_index);
+
 	emit_cf_list(ctx, &impl->body);
 	emit_block(ctx, impl->end_block);
 

From 317628dbb35d03d1e855332c892594ae491c5d24 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 18 Nov 2015 16:33:41 -0500
Subject: [PATCH 053/241] nir: extract out helper macros for running passes

Note these are a bit uglier, due to avoidance of GNU C extensions.  But
drivers which do not need to be built with compilers that don't support
the extension can wrap these macros with their own.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/glsl/nir/nir.h                  | 34 +++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_nir.c | 43 ++++++-----------------------
 2 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index a8972ac6ad5..42867382544 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1903,12 +1903,46 @@ nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s);
 void nir_validate_shader(nir_shader *shader);
 void nir_metadata_set_validation_flag(nir_shader *shader);
 void nir_metadata_check_validation_flag(nir_shader *shader);
+
+#include "util/debug.h"
+static inline bool
+should_clone_nir(void)
+{
+   static int should_clone = -1; /* -1: NIR_TEST_CLONE has not been looked up yet */
+   if (should_clone < 0)
+      should_clone = env_var_as_boolean("NIR_TEST_CLONE", false); /* looked up once, then cached */
+
+   return should_clone;
+}
 #else
 static inline void nir_validate_shader(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
+static inline bool should_clone_nir(void) { return false; }
 #endif /* DEBUG */
 
+#define _PASS(nir, do_pass) do {                                     \
+   do_pass /* statements supplied by NIR_PASS/NIR_PASS_V */          \
+   nir_validate_shader(nir);                                         \
+   if (should_clone_nir()) { /* NIR_TEST_CLONE round-trip */         \
+      nir_shader *clone = nir_shader_clone(ralloc_parent(nir), nir); \
+      ralloc_free(nir);                                              \
+      nir = clone; /* caller's variable now points at the clone */   \
+   }                                                                 \
+} while (0)
+
+#define NIR_PASS(progress, nir, pass, ...) _PASS(nir,                \
+   nir_metadata_set_validation_flag(nir);                            \
+   if (pass(nir, ##__VA_ARGS__)) { /* pass reported progress */      \
+      progress = true; /* only ever set, never cleared here */       \
+      nir_metadata_check_validation_flag(nir);                       \
+   }                                                                 \
+)
+
+#define NIR_PASS_V(nir, pass, ...) _PASS(nir,                        \
+   pass(nir, ##__VA_ARGS__); /* result, if any, is ignored */        \
+)
+
 void nir_calc_dominance_impl(nir_function_impl *impl);
 void nir_calc_dominance(nir_shader *shader);
 
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index e031173036a..f8b258bf96c 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -405,42 +405,15 @@ brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
    }
 }
 
-#include "util/debug.h"
+#define OPT(pass, ...) ({                                  \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   if (this_progress)                                      \
+      progress = true;                                     \
+   this_progress;                                          \
+})
 
-static bool
-should_clone_nir()
-{
-   static int should_clone = -1;
-   if (should_clone < 1)
-      should_clone = env_var_as_boolean("NIR_TEST_CLONE", false);
-
-   return should_clone;
-}
-
-#define _OPT(do_pass) (({                                            \
-   bool this_progress = true;                                        \
-   do_pass                                                           \
-   nir_validate_shader(nir);                                         \
-   if (should_clone_nir()) {                                         \
-      nir_shader *clone = nir_shader_clone(ralloc_parent(nir), nir); \
-      ralloc_free(nir);                                              \
-      nir = clone;                                                   \
-   }                                                                 \
-   this_progress;                                                    \
-}))
-
-#define OPT(pass, ...) _OPT(                   \
-   nir_metadata_set_validation_flag(nir);      \
-   this_progress = pass(nir ,##__VA_ARGS__);   \
-   if (this_progress) {                        \
-      progress = true;                         \
-      nir_metadata_check_validation_flag(nir); \
-   }                                           \
-)
-
-#define OPT_V(pass, ...) _OPT( \
-   pass(nir, ##__VA_ARGS__);   \
-)
+#define OPT_V(pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 
 static nir_shader *
 nir_optimize(nir_shader *nir, bool is_scalar)

From 3684e899ea545c8cc7becc5f39ed69f43d430794 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 24 Oct 2015 14:54:56 -0400
Subject: [PATCH 054/241] freedreno/ir3: use NIR_PASS helper macros

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_nir.c | 47 ++++++++++++---------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
index 4d83ee6a987..565b9c32c1d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -58,6 +58,14 @@ ir3_key_lowers_nir(const struct ir3_shader_key *key)
 			key->ucp_enables | key->color_two_side;
 }
 
+#define OPT(nir, pass, ...) ({                             \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   this_progress;                                          \
+})
+
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
 struct nir_shader *
 ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
 		const struct ir3_shader_key *key)
@@ -97,40 +105,41 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
 		debug_printf("----------------------\n");
 	}
 
-	nir_opt_global_to_local(s);
-	nir_convert_to_ssa(s);
+	OPT_V(s, nir_opt_global_to_local);
+	OPT_V(s, nir_convert_to_ssa);
+
 	if (key) {
 		if (s->stage == MESA_SHADER_VERTEX) {
-			nir_lower_clip_vs(s, key->ucp_enables);
+			OPT_V(s, nir_lower_clip_vs, key->ucp_enables);
 		} else if (s->stage == MESA_SHADER_FRAGMENT) {
-			nir_lower_clip_fs(s, key->ucp_enables);
+			OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
 		}
 		if (key->color_two_side) {
-			nir_lower_two_sided_color(s);
+			OPT_V(s, nir_lower_two_sided_color);
 		}
 	}
-	nir_lower_tex(s, &tex_options);
-	nir_lower_idiv(s);
-	nir_lower_load_const_to_scalar(s);
+
+	OPT_V(s, nir_lower_tex, &tex_options);
+	OPT_V(s, nir_lower_idiv);
+	OPT_V(s, nir_lower_load_const_to_scalar);
 
 	do {
 		progress = false;
 
-		nir_lower_vars_to_ssa(s);
-		nir_lower_alu_to_scalar(s);
-		nir_lower_phis_to_scalar(s);
+		OPT_V(s, nir_lower_vars_to_ssa);
+		OPT_V(s, nir_lower_alu_to_scalar);
+		OPT_V(s, nir_lower_phis_to_scalar);
 
-		progress |= nir_copy_prop(s);
-		progress |= nir_opt_dce(s);
-		progress |= nir_opt_cse(s);
-		progress |= ir3_nir_lower_if_else(s);
-		progress |= nir_opt_algebraic(s);
-		progress |= nir_opt_constant_folding(s);
+		progress |= OPT(s, nir_copy_prop);
+		progress |= OPT(s, nir_opt_dce);
+		progress |= OPT(s, nir_opt_cse);
+		progress |= OPT(s, ir3_nir_lower_if_else);
+		progress |= OPT(s, nir_opt_algebraic);
+		progress |= OPT(s, nir_opt_constant_folding);
 
 	} while (progress);
 
-	nir_remove_dead_variables(s);
-	nir_validate_shader(s);
+	OPT_V(s, nir_remove_dead_variables);
 
 	if (fd_mesa_debug & FD_DBG_DISASM) {
 		debug_printf("----------------------\n");

From bf34748b39a8ae81f314db083eb73bb0be4e9c1d Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 3 Jan 2016 11:29:09 -0500
Subject: [PATCH 055/241] nouveau: fix double-const qualifier

Reported by Tom^ on IRC. The original intent was to mark the pointer
itself constant as well as the data being pointed to, so move the *
between the two const qualifiers.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/drivers/dri/nouveau/nouveau_driver.c | 2 +-
 src/mesa/drivers/dri/nouveau/nouveau_driver.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/nouveau/nouveau_driver.c b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
index 7f31b2851e4..998e751fc3c 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_driver.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
@@ -35,7 +35,7 @@
 
 #include "drivers/common/meta.h"
 
-const char const *nouveau_vendor_string = "Nouveau";
+const char * const nouveau_vendor_string = "Nouveau";
 
 const char *
 nouveau_get_renderer_string(unsigned chipset)
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_driver.h b/src/mesa/drivers/dri/nouveau/nouveau_driver.h
index a4273a554bd..237e9563246 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_driver.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_driver.h
@@ -69,7 +69,7 @@ struct nouveau_driver {
 #define nouveau_error(format, ...) \
 	fprintf(stderr, "%s: " format, __func__, ## __VA_ARGS__)
 
-extern const char const *nouveau_vendor_string;
+extern const char * const nouveau_vendor_string;
 
 const char *
 nouveau_get_renderer_string(unsigned chipset);

From 0ab2c21b938993fe6b42e623e09103185b0deee8 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Sun, 3 Jan 2016 18:40:39 +0100
Subject: [PATCH 056/241] st/mesa: fix parameter names for tesseval/tessctrl
 prototypes

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/state_tracker/st_program.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index a8571f0c441..a74531581b4 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -405,12 +405,12 @@ st_get_gp_variant(struct st_context *st,
 
 extern struct st_tcp_variant *
 st_get_tcp_variant(struct st_context *st,
-                   struct st_tessctrl_program *stgp,
+                   struct st_tessctrl_program *sttcp,
                    const struct st_tcp_variant_key *key);
 
 extern struct st_tep_variant *
 st_get_tep_variant(struct st_context *st,
-                   struct st_tesseval_program *stgp,
+                   struct st_tesseval_program *sttep,
                    const struct st_tep_variant_key *key);
 
 extern void
@@ -427,11 +427,11 @@ st_release_gp_variants(struct st_context *st,
 
 extern void
 st_release_tcp_variants(struct st_context *st,
-                        struct st_tessctrl_program *stgp);
+                        struct st_tessctrl_program *sttcp);
 
 extern void
 st_release_tep_variants(struct st_context *st,
-                        struct st_tesseval_program *stgp);
+                        struct st_tesseval_program *sttep);
 
 extern void
 st_destroy_program_variants(struct st_context *st);

From 6eb74b87b8c1ab7065e16b9f5739d59c820e6128 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 29 Oct 2015 02:52:55 -0400
Subject: [PATCH 057/241] gallium: document PK2H/UP2H

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/docs/source/tgsi.rst | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 955ece89da5..2149d08419a 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -458,7 +458,11 @@ while DDY is allowed to be the same for the entire 2x2 quad.
 
 .. opcode:: PK2H - Pack Two 16-bit Floats
 
-  TBD
+This instruction replicates its result.
+
+.. math::
+
+  dst = f32\_to\_f16(src.x) | (f32\_to\_f16(src.y) << 16)
 
 
 .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars
@@ -615,7 +619,15 @@ This instruction replicates its result.
 
 .. opcode:: UP2H - Unpack Two 16-Bit Floats
 
-  TBD
+.. math::
+
+  dst.x = f16\_to\_f32(src0.x \& 0xffff)
+
+  dst.y = f16\_to\_f32(src0.x >> 16)
+
+  dst.z = f16\_to\_f32(src0.x \& 0xffff)
+
+  dst.w = f16\_to\_f32(src0.x >> 16)
 
 .. note::
 

From 459e4532af44ea4490bbb4085a7487c40ae952cc Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 29 Oct 2015 17:52:46 -0400
Subject: [PATCH 058/241] tgsi: update PK2H/UP2H channel behavior info

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_info.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 3b40c3de97d..c078b6f94ee 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -77,10 +77,10 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 1, 1, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
    { 1, 1, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
    { 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK2H", TGSI_OPCODE_PK2H },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK2US", TGSI_OPCODE_PK2US },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK4B", TGSI_OPCODE_PK4B },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK4UB", TGSI_OPCODE_PK4UB },
+   { 1, 1, 0, 0, 0, 0, REPL, "PK2H", TGSI_OPCODE_PK2H },
+   { 1, 1, 0, 0, 0, 0, REPL, "PK2US", TGSI_OPCODE_PK2US },
+   { 1, 1, 0, 0, 0, 0, REPL, "PK4B", TGSI_OPCODE_PK4B },
+   { 1, 1, 0, 0, 0, 0, REPL, "PK4UB", TGSI_OPCODE_PK4UB },
    { 0, 1, 0, 0, 0, 1, NONE, "", 44 },      /* removed */
    { 1, 2, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
    { 0, 1, 0, 0, 0, 1, NONE, "", 46 },      /* removed */
@@ -92,10 +92,10 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 1, 2, 1, 0, 0, 0, OTHR, "TEX", TGSI_OPCODE_TEX },
    { 1, 4, 1, 0, 0, 0, OTHR, "TXD", TGSI_OPCODE_TXD },
    { 1, 2, 1, 0, 0, 0, OTHR, "TXP", TGSI_OPCODE_TXP },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP2H", TGSI_OPCODE_UP2H },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP2US", TGSI_OPCODE_UP2US },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP4B", TGSI_OPCODE_UP4B },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP4UB", TGSI_OPCODE_UP4UB },
+   { 1, 1, 0, 0, 0, 0, CHAN, "UP2H", TGSI_OPCODE_UP2H },
+   { 1, 1, 0, 0, 0, 0, CHAN, "UP2US", TGSI_OPCODE_UP2US },
+   { 1, 1, 0, 0, 0, 0, CHAN, "UP4B", TGSI_OPCODE_UP4B },
+   { 1, 1, 0, 0, 0, 0, CHAN, "UP4UB", TGSI_OPCODE_UP4UB },
    { 0, 1, 0, 0, 0, 1, NONE, "", 59 },      /* removed */
    { 0, 1, 0, 0, 0, 1, NONE, "", 60 },      /* removed */
    { 1, 1, 0, 0, 0, 0, COMP, "ARR", TGSI_OPCODE_ARR },

From e9f43d6333657d3f6c47c656fbbe18aaebbe804a Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 2 Jan 2016 18:55:48 -0500
Subject: [PATCH 059/241] gallium: add PIPE_CAP_TGSI_PACK_HALF_FLOAT to
 indicate UP2H/PK2H support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/docs/source/screen.rst               | 2 ++
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c           | 1 +
 src/gallium/drivers/ilo/ilo_screen.c             | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c         | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 1 +
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 1 +
 src/gallium/drivers/softpipe/sp_screen.c         | 1 +
 src/gallium/drivers/svga/svga_screen.c           | 1 +
 src/gallium/drivers/vc4/vc4_screen.c             | 1 +
 src/gallium/drivers/virgl/virgl_screen.c         | 1 +
 src/gallium/include/pipe/p_defines.h             | 1 +
 16 files changed, 17 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 41bd0f81e0e..30d497f59e3 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -285,6 +285,8 @@ The integer capabilities:
 * ``PIPE_CAP_DRAW_PARAMETERS``: Whether ``TGSI_SEMANTIC_BASEVERTEX``,
   ``TGSI_SEMANTIC_BASEINSTANCE``, and ``TGSI_SEMANTIC_DRAWID`` are
   supported in vertex shaders.
+* ``PIPE_CAP_TGSI_PACK_HALF_FLOAT``: Whether the ``UP2H`` and ``PK2H``
+  TGSI opcodes are supported.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 4b6d6af0837..4b377b4b087 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -239,6 +239,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 14bd8d797d2..7eab1755c5a 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -255,6 +255,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index ac29b5605bd..6b8e619d32a 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -477,6 +477,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 535296317ab..f4a51ce0a49 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -302,6 +302,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 02303bb79f2..6ef949de809 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -175,6 +175,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index b3f2492fe64..5cc000a5d68 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -218,6 +218,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index f029d164436..57ef75db988 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -207,6 +207,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index e3f2505e139..c605ce5a7b8 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -201,6 +201,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
         case PIPE_CAP_CLEAR_TEXTURE:
         case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 8208686dcb0..d71082fddfd 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -349,6 +349,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index fda5a1eed0b..79bbc48d1f4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -341,6 +341,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 7a35a2a81fb..6af96d9edaa 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -252,6 +252,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index fe8e75e1f66..8ad3c87b66f 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -349,6 +349,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index e845dfdedfe..a4acf2cdb0f 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -190,6 +190,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
+	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 668f2e01cdd..0418cbb7964 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -220,6 +220,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index d4933e74884..d6881f90274 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -636,6 +636,7 @@ enum pipe_cap
    PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS,
    PIPE_CAP_CLEAR_TEXTURE,
    PIPE_CAP_DRAW_PARAMETERS,
+   PIPE_CAP_TGSI_PACK_HALF_FLOAT,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From 20dee333f3790b0a8f197efeaa54f91522cb606a Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 29 Oct 2015 02:52:57 -0400
Subject: [PATCH 060/241] st/mesa: use PK2H/UP2H when supported

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/state_tracker/st_context.c        |  2 ++
 src/mesa/state_tracker/st_context.h        |  1 +
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 16 +++++++++++-----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 2fb792d628f..724c3c5938a 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -248,6 +248,8 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
           PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600));
    st->has_time_elapsed =
       screen->get_param(screen, PIPE_CAP_QUERY_TIME_ELAPSED);
+   st->has_half_float_packing =
+      screen->get_param(screen, PIPE_CAP_TGSI_PACK_HALF_FLOAT);
 
    /* GL limits and extensions */
    st_init_limits(st->pipe->screen, &ctx->Const, &ctx->Extensions);
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 60a9a4bb0d5..276fa63223e 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -101,6 +101,7 @@ struct st_context
    boolean prefer_blit_based_texture_transfer;
    boolean force_persample_in_shader;
    boolean has_shareable_shaders;
+   boolean has_half_float_packing;
 
    /**
     * If a shader can be created when we get its source.
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index cdbe2f4f8a8..2adb57d11ad 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2163,15 +2163,20 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
 
+   case ir_unop_pack_half_2x16:
+      emit_asm(ir, TGSI_OPCODE_PK2H, result_dst, op[0]);
+      break;
+   case ir_unop_unpack_half_2x16:
+      emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
+      break;
+
    case ir_unop_pack_snorm_2x16:
    case ir_unop_pack_unorm_2x16:
-   case ir_unop_pack_half_2x16:
    case ir_unop_pack_snorm_4x8:
    case ir_unop_pack_unorm_4x8:
 
    case ir_unop_unpack_snorm_2x16:
    case ir_unop_unpack_unorm_2x16:
-   case ir_unop_unpack_half_2x16:
    case ir_unop_unpack_half_2x16_split_x:
    case ir_unop_unpack_half_2x16_split_y:
    case ir_unop_unpack_snorm_4x8:
@@ -5853,13 +5858,14 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                                LOWER_PACK_SNORM_4x8 |
                                LOWER_UNPACK_SNORM_4x8 |
                                LOWER_UNPACK_UNORM_4x8 |
-                               LOWER_PACK_UNORM_4x8 |
-                               LOWER_PACK_HALF_2x16 |
-                               LOWER_UNPACK_HALF_2x16;
+                               LOWER_PACK_UNORM_4x8;
 
          if (ctx->Extensions.ARB_gpu_shader5)
             lower_inst |= LOWER_PACK_USE_BFI |
                           LOWER_PACK_USE_BFE;
+         if (!ctx->st->has_half_float_packing)
+            lower_inst |= LOWER_PACK_HALF_2x16 |
+                          LOWER_UNPACK_HALF_2x16;
 
          lower_packing_builtins(ir, lower_inst);
       }

From 34217018c4ae9c2c672534494be0c5b9569609e2 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 29 Oct 2015 02:52:56 -0400
Subject: [PATCH 061/241] nvc0/ir: add support for PK2H/UP2H

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp    |  1 +
 .../nouveau/codegen/nv50_ir_emit_nvc0.cpp     |  5 ++++-
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 22 +++++++++++++++++++
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  2 +-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index e9ddd366391..ec74e7ac811 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -740,6 +740,7 @@ CodeEmitterGM107::emitF2F()
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
    emitFMZ  (0x2c, 1);
+   emitField(0x29, 1, insn->subOp);
    emitRND  (0x27, rnd, 0x2a);
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
    emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 1d4f0d92f6b..0b28047e22b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1030,7 +1030,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)
 
       // for 8/16 source types, the byte/word is in subOp. word 1 is
       // represented as 2.
-      code[1] |= i->subOp << 0x17;
+      if (!isFloatType(i->sType))
+         code[1] |= i->subOp << 0x17;
+      else
+         code[1] |= i->subOp << 0x18;
 
       if (sat)
          code[0] |= 0x20;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index beb67fe20f1..0d41c023db0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -319,6 +319,10 @@ unsigned int Instruction::srcMask(unsigned int s) const
          x |= 2;
       return x;
    }
+   case TGSI_OPCODE_PK2H:
+      return 0x3;
+   case TGSI_OPCODE_UP2H:
+      return 0x1;
    default:
       break;
    }
@@ -452,6 +456,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
    case TGSI_OPCODE_ATOMUMAX:
    case TGSI_OPCODE_UBFE:
    case TGSI_OPCODE_UMSB:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_I2D:
@@ -516,10 +521,12 @@ nv50_ir::DataType Instruction::inferDstType() const
    case TGSI_OPCODE_DSGE:
    case TGSI_OPCODE_DSLT:
    case TGSI_OPCODE_DSNE:
+   case TGSI_OPCODE_PK2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_U2F:
    case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_F32;
    case TGSI_OPCODE_I2D:
    case TGSI_OPCODE_U2D:
@@ -2807,6 +2814,21 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
          mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c));
       break;
+   case TGSI_OPCODE_PK2H:
+      val0 = getScratch();
+      val1 = getScratch();
+      mkCvt(OP_CVT, TYPE_F16, val0, TYPE_F32, fetchSrc(0, 0));
+      mkCvt(OP_CVT, TYPE_F16, val1, TYPE_F32, fetchSrc(0, 1));
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+         mkOp3(OP_INSBF, TYPE_U32, dst0[c], val1, mkImm(0x1010), val0);
+      break;
+   case TGSI_OPCODE_UP2H:
+      src0 = fetchSrc(0, 0);
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         geni = mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F16, src0);
+         geni->subOp = c & 1;
+      }
+      break;
    case TGSI_OPCODE_EMIT:
       /* export the saved viewport index */
       if (viewport != NULL) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 57ef75db988..174d35df1b3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -185,6 +185,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -207,7 +208,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:

From dd79034ca68be7216615c824bac07ccae889004f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 30 Dec 2015 15:04:26 +0100
Subject: [PATCH 062/241] radeonsi: rename shader parameter definitions and
 variables for more clarity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c |  8 +--
 src/gallium/drivers/radeonsi/si_shader.c      | 62 +++++++++----------
 src/gallium/drivers/radeonsi/si_shader.h      | 16 ++---
 3 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 5b0ad8f5622..d157a9ffb00 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1011,19 +1011,19 @@ void si_init_all_descriptors(struct si_context *sctx)
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_init_buffer_resources(&sctx->const_buffers[i],
-					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
+					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
 					 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
 		si_init_buffer_resources(&sctx->rw_buffers[i],
 					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 		si_init_descriptors(&sctx->samplers[i].views.desc,
-				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+				    SI_SGPR_SAMPLER_VIEWS, 8, SI_NUM_SAMPLER_VIEWS);
 		si_init_descriptors(&sctx->samplers[i].states.desc,
-				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
+				    SI_SGPR_SAMPLER_STATES, 4, SI_NUM_SAMPLER_STATES);
 	}
 
-	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
 			    4, SI_NUM_VERTEX_BUFFERS);
 
 	/* Set pipe_context functions. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 270cc20ff10..b453d97fa7e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -82,11 +82,11 @@ struct si_shader_context
 	int param_es2gs_offset;
 	LLVMTargetMachineRef tm;
 	LLVMValueRef const_md;
-	LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
+	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
 	LLVMValueRef lds;
 	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
-	LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
-	LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
+	LLVMValueRef sampler_views[SI_NUM_SAMPLER_VIEWS];
+	LLVMValueRef sampler_states[SI_NUM_SAMPLER_STATES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
@@ -394,7 +394,7 @@ static void declare_input_vs(
 	LLVMValueRef input;
 
 	/* Load the T list */
-	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFER);
+	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
 
 	t_offset = lp_build_const_int32(gallivm, input_index);
 
@@ -1065,7 +1065,7 @@ static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld,
 	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
-	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
 	LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
 
@@ -1233,13 +1233,13 @@ static LLVMValueRef fetch_constant(
 	}
 
 	if (reg->Register.Dimension && reg->Dimension.Indirect) {
-		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 		LLVMValueRef index;
 		index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
 						   reg->Dimension.Index);
 		bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
 	} else
-		bufp = si_shader_ctx->const_resource[buf];
+		bufp = si_shader_ctx->const_buffers[buf];
 
 	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
@@ -1260,7 +1260,7 @@ static LLVMValueRef fetch_constant(
 		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
 				     lp_build_const_int32(base->gallivm, idx * 4));
 
-		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
+		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_buffers[buf],
 				   addr2, bld_base->base.elem_type);
 
 		result = radeon_llvm_emit_fetch_double(bld_base,
@@ -1432,7 +1432,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
 	unsigned chan;
 	unsigned const_chan;
 	LLVMValueRef base_elt;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
 	LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index);
 
@@ -2390,10 +2390,10 @@ static void tex_fetch_ptrs(
 
 		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
 
-		*res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+		*res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
 		*res_ptr = build_indexed_load_const(si_shader_ctx, *res_ptr, ind_index);
 
-		*samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+		*samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
 		*samp_ptr = build_indexed_load_const(si_shader_ctx, *samp_ptr, ind_index);
 
 		if (target == TGSI_TEXTURE_2D_MSAA ||
@@ -2401,13 +2401,13 @@ static void tex_fetch_ptrs(
 			ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
 						 lp_build_const_int32(gallivm,
 								      SI_FMASK_TEX_OFFSET), "");
-			*fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+			*fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
 			*fmask_ptr = build_indexed_load_const(si_shader_ctx, *fmask_ptr, ind_index);
 		}
 	} else {
-		*res_ptr = si_shader_ctx->resources[sampler_index];
-		*samp_ptr = si_shader_ctx->samplers[sampler_index];
-		*fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
+		*res_ptr = si_shader_ctx->sampler_views[sampler_index];
+		*samp_ptr = si_shader_ctx->sampler_states[sampler_index];
+		*fmask_ptr = si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + sampler_index];
 	}
 }
 
@@ -3432,15 +3432,15 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 	v16i8 = LLVMVectorType(i8, 16);
 
 	params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS);
-	params[SI_PARAM_CONST] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
-	params[SI_PARAM_SAMPLER] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
-	params[SI_PARAM_RESOURCE] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
-	last_array_pointer = SI_PARAM_RESOURCE;
+	params[SI_PARAM_CONST_BUFFERS] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
+	params[SI_PARAM_SAMPLER_STATES] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
+	params[SI_PARAM_SAMPLER_VIEWS] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
+	last_array_pointer = SI_PARAM_SAMPLER_VIEWS;
 
 	switch (si_shader_ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
-		params[SI_PARAM_VERTEX_BUFFER] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
-		last_array_pointer = SI_PARAM_VERTEX_BUFFER;
+		params[SI_PARAM_VERTEX_BUFFERS] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
+		last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
 		params[SI_PARAM_BASE_VERTEX] = i32;
 		params[SI_PARAM_START_INSTANCE] = i32;
 		num_params = SI_PARAM_START_INSTANCE+1;
@@ -3452,8 +3452,8 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
 		} else {
 			if (shader->is_gs_copy_shader) {
-				last_array_pointer = SI_PARAM_CONST;
-				num_params = SI_PARAM_CONST+1;
+				last_array_pointer = SI_PARAM_CONST_BUFFERS;
+				num_params = SI_PARAM_CONST_BUFFERS+1;
 			} else {
 				params[SI_PARAM_VS_STATE_BITS] = i32;
 				num_params = SI_PARAM_VS_STATE_BITS+1;
@@ -3610,7 +3610,7 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	const struct tgsi_shader_info * info = bld_base->info;
 	unsigned buf;
-	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 
 	for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
 		unsigned i, num_const = info->const_file_max[buf] + 1;
@@ -3622,14 +3622,14 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
 		si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
 
 		/* Load the resource descriptor */
-		si_shader_ctx->const_resource[buf] =
+		si_shader_ctx->const_buffers[buf] =
 			build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf));
 
 		/* Load the constants, we rely on the code sinking to do the rest */
 		for (i = 0; i < num_const * 4; ++i) {
 			si_shader_ctx->constants[buf][i] =
 				buffer_load_const(gallivm->builder,
-					si_shader_ctx->const_resource[buf],
+					si_shader_ctx->const_buffers[buf],
 					lp_build_const_int32(gallivm, i * 4),
 					bld_base->base.elem_type);
 		}
@@ -3650,23 +3650,23 @@ static void preload_samplers(struct si_shader_context *si_shader_ctx)
 	if (num_samplers == 0)
 		return;
 
-	res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
-	samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+	res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
+	samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
 
 	/* Load the resources and samplers, we rely on the code sinking to do the rest */
 	for (i = 0; i < num_samplers; ++i) {
 		/* Resource */
 		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->resources[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
+		si_shader_ctx->sampler_views[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
 
 		/* Sampler */
 		offset = lp_build_const_int32(gallivm, i);
-		si_shader_ctx->samplers[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
+		si_shader_ctx->sampler_states[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
 
 		/* FMASK resource */
 		if (info->is_msaa_sampler[i]) {
 			offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i);
-			si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + i] =
+			si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + i] =
 				build_indexed_load_const(si_shader_ctx, res_ptr, offset);
 		}
 	}
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index adcbb332e9d..443900b26ee 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -76,10 +76,10 @@ struct radeon_shader_binary;
 struct radeon_shader_reloc;
 
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
-#define SI_SGPR_CONST		2
-#define SI_SGPR_SAMPLER		4
-#define SI_SGPR_RESOURCE	6
-#define SI_SGPR_VERTEX_BUFFER	8  /* VS only */
+#define SI_SGPR_CONST_BUFFERS	2
+#define SI_SGPR_SAMPLER_STATES	4
+#define SI_SGPR_SAMPLER_VIEWS	6
+#define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
 #define SI_SGPR_VS_STATE_BITS	12 /* VS(VS) only */
@@ -101,12 +101,12 @@ struct radeon_shader_reloc;
 
 /* LLVM function parameter indices */
 #define SI_PARAM_RW_BUFFERS	0
-#define SI_PARAM_CONST		1
-#define SI_PARAM_SAMPLER	2
-#define SI_PARAM_RESOURCE	3
+#define SI_PARAM_CONST_BUFFERS	1
+#define SI_PARAM_SAMPLER_STATES	2
+#define SI_PARAM_SAMPLER_VIEWS	3
 
 /* VS only parameters */
-#define SI_PARAM_VERTEX_BUFFER	4
+#define SI_PARAM_VERTEX_BUFFERS	4
 #define SI_PARAM_BASE_VERTEX	5
 #define SI_PARAM_START_INSTANCE	6
 /* [0] = clamp vertex color */

From 3ce0a2fd7f4270b036b4449a312cff8dcfd5925e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 22:22:24 +0100
Subject: [PATCH 063/241] radeonsi: pass TGSI processor type to si_compile_llvm
 for dumping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The parameter will be used later.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 2 +-
 src/gallium/drivers/radeonsi/si_shader.c  | 6 +++---
 src/gallium/drivers/radeonsi/si_shader.h  | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 2565117581e..32e265e2a36 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -123,7 +123,7 @@ static void *si_create_compute_state(
 		        LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
                                                         code, header->num_bytes);
 			si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm,
-					mod, &sctx->b.debug);
+					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
 			LLVMDisposeModule(mod);
 		}
 	}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index b453d97fa7e..c2e802e69f2 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3921,7 +3921,7 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod,
-		    struct pipe_debug_callback *debug)
+		    struct pipe_debug_callback *debug, unsigned processor)
 {
 	int r = 0;
 	bool dump_asm = r600_can_dump_shader(&sscreen->b,
@@ -4026,7 +4026,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 	r = si_compile_llvm(sscreen, si_shader_ctx->shader,
 			    si_shader_ctx->tm, bld_base->base.gallivm->module,
-			    debug);
+			    debug, TGSI_PROCESSOR_GEOMETRY);
 
 	radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
 
@@ -4235,7 +4235,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
 
 	mod = bld_base->base.gallivm->module;
-	r = si_compile_llvm(sscreen, shader, tm, mod, debug);
+	r = si_compile_llvm(sscreen, shader, tm, mod, debug, si_shader_ctx.type);
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
 		goto out;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 443900b26ee..b50b43c2390 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -332,7 +332,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod,
-		    struct pipe_debug_callback *debug);
+		    struct pipe_debug_callback *debug, unsigned processor);
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);

From fd7000bd7897565a7a019a7ec3aacb832d696d22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 22:24:41 +0100
Subject: [PATCH 064/241] radeonsi: pass TGSI processor type to
 si_shader_binary_read for dumping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The parameter will be used later.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 3 ++-
 src/gallium/drivers/radeonsi/si_shader.c  | 4 ++--
 src/gallium/drivers/radeonsi/si_shader.h  | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 32e265e2a36..469e1cc0bcd 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -136,7 +136,8 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader, &sctx->b.debug);
+	si_shader_binary_read(sctx->screen, &program->shader, &sctx->b.debug,
+			      TGSI_PROCESSOR_COMPUTE);
 
 #endif
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index c2e802e69f2..fae90253afe 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3888,7 +3888,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 }
 
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-			  struct pipe_debug_callback *debug)
+			  struct pipe_debug_callback *debug, unsigned processor)
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
 	int r;
@@ -3940,7 +3940,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 			return r;
 	}
 
-	r = si_shader_binary_read(sscreen, shader, debug);
+	r = si_shader_binary_read(sscreen, shader, debug, processor);
 
 	FREE(shader->binary.config);
 	FREE(shader->binary.rodata);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b50b43c2390..236148c34da 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -337,7 +337,7 @@ void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-			  struct pipe_debug_callback *debug);
+			  struct pipe_debug_callback *debug, unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);

From cd7f252b114b7771cde9bba698eddaa1b4b93ab4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 21:24:47 +0100
Subject: [PATCH 065/241] gallium/radeon: r600_can_dump_shader should get TGSI
 processor type directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r600/r600_shader.c        | 6 ++++--
 src/gallium/drivers/radeon/r600_pipe_common.c | 8 ++------
 src/gallium/drivers/radeon/r600_pipe_common.h | 2 +-
 src/gallium/drivers/radeonsi/si_shader.c      | 9 +++------
 4 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 9c040aeec4a..c1565498ea5 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -162,7 +162,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_shader_selector *sel = shader->selector;
 	int r;
-	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
+	bool dump = r600_can_dump_shader(&rctx->screen->b,
+					 tgsi_get_processor_type(sel->tokens));
 	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
 	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
 	unsigned export_shader;
@@ -3238,7 +3239,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	if (use_llvm) {
 		struct radeon_llvm_context radeon_llvm_ctx;
 		LLVMModuleRef mod;
-		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
+		bool dump = r600_can_dump_shader(&rscreen->b,
+						 tgsi_get_processor_type(tokens));
 		boolean use_kill = false;
 
 		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 4bfb3591c84..52c365e81d0 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -1011,13 +1011,9 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 }
 
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-			  const struct tgsi_token *tokens)
+			  unsigned processor)
 {
-	/* Compute shader don't have tgsi_tokens */
-	if (!tokens)
-		return (rscreen->debug_flags & DBG_CS) != 0;
-
-	switch (tgsi_get_processor_type(tokens)) {
+	switch (processor) {
 	case TGSI_PROCESSOR_VERTEX:
 		return (rscreen->debug_flags & DBG_VS) != 0;
 	case TGSI_PROCESSOR_TESS_CTRL:
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index a69e627a2e9..68b50a9fb0f 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -516,7 +516,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 void r600_common_context_cleanup(struct r600_common_context *rctx);
 void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r);
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-			  const struct tgsi_token *tokens);
+			  unsigned processor);
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
 			      unsigned offset, unsigned size, unsigned value,
 			      bool is_framebuffer);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index fae90253afe..37a753c18f1 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3892,15 +3892,13 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
 	int r;
-	bool dump  = r600_can_dump_shader(&sscreen->b,
-		shader->selector ? shader->selector->tokens : NULL);
 
 	si_shader_binary_read_config(sscreen, shader, 0);
 	r = si_shader_binary_upload(sscreen, shader);
 	if (r)
 		return r;
 
-	if (dump) {
+	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
 			si_shader_dump_disassembly(binary, debug);
 
@@ -3924,8 +3922,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor)
 {
 	int r = 0;
-	bool dump_asm = r600_can_dump_shader(&sscreen->b,
-				shader->selector ? shader->selector->tokens : NULL);
+	bool dump_asm = r600_can_dump_shader(&sscreen->b, processor);
 	bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR);
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
 
@@ -4092,7 +4089,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	int r = 0;
 	bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
 			    shader->key.ps.poly_stipple;
-	bool dump = r600_can_dump_shader(&sscreen->b, sel->tokens);
+	bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor);
 
 	if (poly_stipple) {
 		tokens = util_pstipple_create_fragment_shader(tokens, NULL,

From fb98acb5a156ce7b50938cc50fc61f8560c8f7f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 21:57:40 +0100
Subject: [PATCH 066/241] gallium/radeon: always add +DumpCode to the LLVM
 target machine for LLVM <= 3.5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the same behavior that we already use for later LLVM versions.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r600/r600_llvm.c          | 2 +-
 src/gallium/drivers/radeon/radeon_llvm_emit.c | 5 ++---
 src/gallium/drivers/radeon/radeon_llvm_emit.h | 2 +-
 src/gallium/drivers/radeonsi/si_shader.c      | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index ef2e2a2a117..b8a20b36af2 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -923,7 +923,7 @@ unsigned r600_llvm_compile(
 	const char * gpu_family = r600_get_llvm_processor_name(family);
 
 	memset(&binary, 0, sizeof(struct radeon_shader_binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL, debug);
+	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, NULL, debug);
 
 	r = r600_create_shader(bc, &binary, use_kill);
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index b765d367dab..408e8156106 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -172,7 +172,7 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  * @returns 0 for success, 1 for failure
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir, bool dump_asm,
+			     const char *gpu_family, bool dump_ir,
 			     LLVMTargetMachineRef tm,
 			     struct pipe_debug_callback *debug)
 {
@@ -199,8 +199,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 		}
 		strncpy(cpu, gpu_family, CPU_STRING_LEN);
 		memset(fs, 0, sizeof(fs));
-		if (dump_asm)
-			strncpy(fs, "+DumpCode", FS_STRING_LEN);
+		strncpy(fs, "+DumpCode", FS_STRING_LEN);
 		tm = LLVMCreateTargetMachine(target, triple, cpu, fs,
 				  LLVMCodeGenLevelDefault, LLVMRelocDefault,
 						  LLVMCodeModelDefault);
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index 29e4dc05a3d..4084740f457 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -39,7 +39,7 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir, bool dump_asm,
+			     const char *gpu_family, bool dump_ir,
 			     LLVMTargetMachineRef tm,
 			     struct pipe_debug_callback *debug);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 37a753c18f1..4fdcd0dcaba 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3931,7 +3931,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 
 	if (!si_replace_shader(count, &shader->binary)) {
 		r = radeon_llvm_compile(mod, &shader->binary,
-			r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm,
+			r600_get_llvm_processor_name(sscreen->b.family), dump_ir, tm,
 			debug);
 		if (r)
 			return r;

From 7fa6bb47e3a19f4ad745f7a9fc77d24c7482d045 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 22:16:05 +0100
Subject: [PATCH 067/241] gallium/radeon: dump LLVM module outside of
 radeon_llvm_compile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r600/r600_llvm.c          |  4 +++-
 src/gallium/drivers/radeon/radeon_llvm_emit.c |  5 ++---
 src/gallium/drivers/radeon/radeon_llvm_emit.h |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c      | 10 ++++++----
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index b8a20b36af2..7f436067551 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -923,7 +923,9 @@ unsigned r600_llvm_compile(
 	const char * gpu_family = r600_get_llvm_processor_name(family);
 
 	memset(&binary, 0, sizeof(struct radeon_shader_binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, NULL, debug);
+	if (dump)
+		LLVMDumpModule(mod);
+	r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug);
 
 	r = r600_create_shader(bc, &binary, use_kill);
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 408e8156106..3d0987624a6 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -172,7 +172,7 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  * @returns 0 for success, 1 for failure
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir,
+			     const char *gpu_family,
 			     LLVMTargetMachineRef tm,
 			     struct pipe_debug_callback *debug)
 {
@@ -205,8 +205,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 						  LLVMCodeModelDefault);
 		dispose_tm = true;
 	}
-	if (dump_ir)
-		LLVMDumpModule(M);
+
 	/* Setup Diagnostic Handler*/
 	llvm_ctx = LLVMGetModuleContext(M);
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index 4084740f457..45f05a9e0e1 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -39,7 +39,7 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			     const char *gpu_family, bool dump_ir,
+			     const char *gpu_family,
 			     LLVMTargetMachineRef tm,
 			     struct pipe_debug_callback *debug);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4fdcd0dcaba..779550bfb95 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3922,16 +3922,18 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor)
 {
 	int r = 0;
-	bool dump_asm = r600_can_dump_shader(&sscreen->b, processor);
-	bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR);
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
 
-	if (dump_ir || dump_asm)
+	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
 
+		if (!(sscreen->b.debug_flags & DBG_NO_IR))
+			LLVMDumpModule(mod);
+	}
+
 	if (!si_replace_shader(count, &shader->binary)) {
 		r = radeon_llvm_compile(mod, &shader->binary,
-			r600_get_llvm_processor_name(sscreen->b.family), dump_ir, tm,
+			r600_get_llvm_processor_name(sscreen->b.family), tm,
 			debug);
 		if (r)
 			return r;

From b6d95248f0c423dfde573617cffddb4fa2d3c8ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 23:22:14 +0100
Subject: [PATCH 068/241] radeonsi: move si_shader_binary_upload out of
 si_shader_binary_read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  1 +
 src/gallium/drivers/radeonsi/si_shader.c  | 16 +++++++---------
 src/gallium/drivers/radeonsi/si_shader.h  |  4 ++--
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 469e1cc0bcd..3d10e0eeb5c 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -138,6 +138,7 @@ static void *si_create_compute_state(
 	init_scratch_buffer(sctx, program);
 	si_shader_binary_read(sctx->screen, &program->shader, &sctx->b.debug,
 			      TGSI_PROCESSOR_COMPUTE);
+	si_shader_binary_upload(sctx->screen, &program->shader);
 
 #endif
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 779550bfb95..c7e38b4eed4 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3887,16 +3887,12 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 	}
 }
 
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-			  struct pipe_debug_callback *debug, unsigned processor)
+void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
+			   struct pipe_debug_callback *debug, unsigned processor)
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
-	int r;
 
 	si_shader_binary_read_config(sscreen, shader, 0);
-	r = si_shader_binary_upload(sscreen, shader);
-	if (r)
-		return r;
 
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
@@ -3913,8 +3909,6 @@ int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
 			   shader->num_sgprs, shader->num_vgprs, binary->code_size,
 			   shader->lds_size, shader->scratch_bytes_per_wave);
-
-	return 0;
 }
 
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
@@ -3939,7 +3933,11 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 			return r;
 	}
 
-	r = si_shader_binary_read(sscreen, shader, debug, processor);
+	si_shader_binary_read(sscreen, shader, debug, processor);
+
+	r = si_shader_binary_upload(sscreen, shader);
+	if (r)
+		return r;
 
 	FREE(shader->binary.config);
 	FREE(shader->binary.rodata);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 236148c34da..882f6980502 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -336,8 +336,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-			  struct pipe_debug_callback *debug, unsigned processor);
+void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
+			   struct pipe_debug_callback *debug, unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);

From 86fa48426cef42d7224139603b52a7d16bd35eb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 28 Dec 2015 01:39:20 +0100
Subject: [PATCH 069/241] radeonsi: remove unused parameter from
 si_shader_binary_read_config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 5 ++---
 src/gallium/drivers/radeonsi/si_shader.c  | 7 +++----
 src/gallium/drivers/radeonsi/si_shader.h  | 5 ++---
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 3d10e0eeb5c..1c4d6b3683b 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -67,8 +67,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 				program->shader.binary.global_symbol_offsets[i];
 		unsigned scratch_bytes_needed;
 
-		si_shader_binary_read_config(sctx->screen,
-						&program->shader, offset);
+		si_shader_binary_read_config(&program->shader, offset);
 		scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
 		scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
 	}
@@ -261,7 +260,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
 	/* Read the config information */
-	si_shader_binary_read_config(sctx->screen, shader, pc);
+	si_shader_binary_read_config(shader, pc);
 #endif
 
 	/* Upload the kernel arguments */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index c7e38b4eed4..426f40fe8f5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3741,9 +3741,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-				struct si_shader *shader,
-				unsigned symbol_offset)
+void si_shader_binary_read_config(struct si_shader *shader,
+				  unsigned symbol_offset)
 {
 	unsigned i;
 	const unsigned char *config =
@@ -3892,7 +3891,7 @@ void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
 
-	si_shader_binary_read_config(sscreen, shader, 0);
+	si_shader_binary_read_config(shader, 0);
 
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 882f6980502..d377a2a2cfd 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -341,8 +341,7 @@ void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-				struct si_shader *shader,
-				unsigned symbol_offset);
+void si_shader_binary_read_config(struct si_shader *shader,
+				  unsigned symbol_offset);
 
 #endif

From 8cf2e892fca20c4776b4a07c39918343cb2d4e0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Samuel=20Iglesias=20Gons=C3=A1lvez?= <siglesias@igalia.com>
Date: Tue, 15 Dec 2015 12:51:48 +0100
Subject: [PATCH 070/241] i965/wm: use proper API buffer size for the surfaces.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 5bb5eeea fixes a bug indicating that the surfaces should have the
API buffer size. However, it picked the wrong value.

This patch adds a new variable, which takes into account
glBindBufferRange() values. This patch fixes the following CTS
regressions:

ES31-CTS.shader_storage_buffer_object.advanced-unsizedArrayLength-cs-std430-vec-bindrangeOffset
ES31-CTS.shader_storage_buffer_object.advanced-unsizedArrayLength-cs-std430-vec-bindrangeSize

Signed-off-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
---
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 12 ++++++++----
 src/mesa/drivers/dri/i965/intel_buffer_objects.c |  1 +
 src/mesa/main/bufferobj.c                        |  4 +++-
 src/mesa/main/mtypes.h                           |  1 +
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 76dc5775121..7da4a404668 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -946,12 +946,14 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptrARB size = MIN2(binding->BufferObject->BufferRangeSize,
+                                   binding->BufferObject->Size - binding->Offset);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_constant_surface(brw, bo, binding->Offset,
-                                     binding->BufferObject->Size - binding->Offset,
+                                     size,
                                      &ubo_surf_offsets[i]);
       }
    }
@@ -968,12 +970,14 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptrARB size = MIN2(binding->BufferObject->BufferRangeSize,
+                                   binding->BufferObject->Size - binding->Offset);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_buffer_surface(brw, bo, binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset,
+                                   size,
                                    &ssbo_surf_offsets[i]);
       }
    }
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 7a5b3fca595..b26c939c214 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -198,6 +198,7 @@ brw_buffer_data(struct gl_context *ctx,
    (void) target;
 
    intel_obj->Base.Size = size;
+   intel_obj->Base.BufferRangeSize = size;
    intel_obj->Base.Usage = usage;
    intel_obj->Base.StorageFlags = storageFlags;
 
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 3a05cd55042..8a9f9b6a48d 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -43,7 +43,7 @@
 #include "glformats.h"
 #include "texstore.h"
 #include "transformfeedback.h"
-
+#include "macros.h"
 
 /* Debug flags */
 /*#define VBO_DEBUG*/
@@ -2840,6 +2840,7 @@ bind_buffer_range_uniform_buffer(struct gl_context *ctx,
 
    _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, bufObj);
    bind_uniform_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
+   bufObj->BufferRangeSize = MIN2(size, bufObj->Size);
 }
 
 /**
@@ -2875,6 +2876,7 @@ bind_buffer_range_shader_storage_buffer(struct gl_context *ctx,
 
    _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
    bind_shader_storage_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
+   bufObj->BufferRangeSize = MIN2(size, bufObj->Size);
 }
 
 /**
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5b9fce8b7cc..97116e400ce 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1268,6 +1268,7 @@ struct gl_buffer_object
    GLenum Usage;        /**< GL_STREAM_DRAW_ARB, GL_STREAM_READ_ARB, etc. */
    GLbitfield StorageFlags; /**< GL_MAP_PERSISTENT_BIT, etc. */
    GLsizeiptrARB Size;  /**< Size of buffer storage in bytes */
+   GLsizeiptrARB BufferRangeSize;  /**< Amount of data that can be read from or written to, in bytes */
    GLubyte *Data;       /**< Location of storage either in RAM or VRAM. */
    GLboolean DeletePending;   /**< true if buffer object is removed from the hash */
    GLboolean Written;   /**< Ever written to? (for debugging) */

From b5f2f7073f047b4e4128cf05af8dddf356f9b48c Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 4 Jan 2016 11:16:45 -0500
Subject: [PATCH 071/241] nv50,nvc0: fix crash when increasing bsp bo size for
 h264

H264 doesn't have a bitplane bo. We just need a device reference, so use
the one from the client.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c | 4 ++--
 src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
index dbde1bfcebe..0f689cb2d6c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -77,7 +77,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -90,7 +90,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
       struct nouveau_bo *tmp_bo = NULL;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
index 4392f62c530..5e8cda0768d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -81,7 +81,7 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -98,7 +98,7 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       cfg.nvc0.tile_mode = 0x10;
       cfg.nvc0.memtype = 0xfe;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);

From b16c9be4a5561bd825176a228c300331f989e837 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 4 Jan 2016 11:26:27 -0500
Subject: [PATCH 072/241] nvc0: scale up inter_bo size so that it's 16M for a
 4K video

Experimentally, 4M causes corruption and slowness, try to ramp it up
with size instead.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_video.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
index 48ffac1b715..5a946ca6314 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -169,9 +169,12 @@ nvc0_create_decoder(struct pipe_context *context,
    for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0, 1 << 20, &cfg, &dec->bsp_bo[i]);
-   if (!ret)
+   if (!ret) {
+      /* total fudge factor... just has to be bigger for higher bitrates? */
+      unsigned inter_size = align(templ->width * templ->height * 2, 4 << 20);
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
-                           0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+                           0x100, inter_size, &cfg, &dec->inter_bo[0]);
+   }
    if (!ret) {
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0x100, dec->inter_bo[0]->size, &cfg,

From 2123bfcc9c8b375ba46bb59d493ddeac1b7291a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Mon, 4 Jan 2016 17:31:05 -0500
Subject: [PATCH 073/241] st/mesa: make KHR_debug output independent of context
 creation flags (v2)

Instead, keep track of GL_DEBUG_OUTPUT and (un)install the pipe_debug_callback
accordingly. Hardware drivers can still use the absence of the callback to
skip more expensive operations in the normal case, and users can no longer be
surprised by the need to set the debug flag at context creation time.

v2:
- re-add the proper initialization of debug contexts (Ilia Mirkin)
- silence a potential warning (Ilia Mirkin)

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/state_tracker/st_context.c | 18 ++++++++
 src/mesa/state_tracker/st_debug.c   | 72 +++++++++++++++++++++++++++++
 src/mesa/state_tracker/st_debug.h   |  4 ++
 src/mesa/state_tracker/st_manager.c | 61 ++----------------------
 4 files changed, 98 insertions(+), 57 deletions(-)

diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 724c3c5938a..31cc99dca89 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -79,6 +79,23 @@
 DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE)
 
 
+/**
+ * Called via ctx->Driver.Enable()
+ */
+static void st_Enable(struct gl_context * ctx, GLenum cap, GLboolean state)
+{
+   struct st_context *st = st_context(ctx);
+
+   switch (cap) {
+   case GL_DEBUG_OUTPUT:
+      st_enable_debug_output(st, state);
+      break;
+   default:
+      break;
+   }
+}
+
+
 /**
  * Called via ctx->Driver.UpdateState()
  */
@@ -457,5 +474,6 @@ void st_init_driver_functions(struct pipe_screen *screen,
 
    st_init_vdpau_functions(functions);
 
+   functions->Enable = st_Enable;
    functions->UpdateState = st_invalidate_state;
 }
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 6d859c6ab5b..134366db09d 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -104,3 +104,75 @@ st_print_current(void)
 }
 
 
+/**
+ * Installed as pipe_debug_callback when GL_DEBUG_OUTPUT is enabled.
+ */
+static void
+st_debug_message(void *data,
+                 unsigned *id,
+                 enum pipe_debug_type ptype,
+                 const char *fmt,
+                 va_list args)
+{
+   struct st_context *st = data;
+   enum mesa_debug_source source;
+   enum mesa_debug_type type;
+   enum mesa_debug_severity severity;
+
+   switch (ptype) {
+   case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_ERROR:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_SHADER_INFO:
+      source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_PERF_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_FALLBACK:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_CONFORMANCE:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   default:
+      unreachable("invalid debug type");
+   }
+   _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
+}
+
+void
+st_enable_debug_output(struct st_context *st, boolean enable)
+{
+   struct pipe_context *pipe = st->pipe;
+
+   if (!pipe->set_debug_callback)
+      return;
+
+   if (enable) {
+      struct pipe_debug_callback cb = { st_debug_message, st };
+      pipe->set_debug_callback(pipe, &cb);
+   } else {
+      pipe->set_debug_callback(pipe, NULL);
+   }
+}
diff --git a/src/mesa/state_tracker/st_debug.h b/src/mesa/state_tracker/st_debug.h
index 288eccf9f9c..ed3ead82914 100644
--- a/src/mesa/state_tracker/st_debug.h
+++ b/src/mesa/state_tracker/st_debug.h
@@ -32,6 +32,8 @@
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
 
+struct st_context;
+
 extern void
 st_print_current(void);
 
@@ -59,6 +61,8 @@ extern int ST_DEBUG;
 
 void st_debug_init( void );
 
+void st_enable_debug_output(struct st_context *st, boolean enable);
+
 static inline void
 ST_DBG( unsigned flag, const char *fmt, ... )
 {
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index d0d261f4fde..385e26b946e 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -39,6 +39,7 @@
 #include "st_texture.h"
 
 #include "st_context.h"
+#include "st_debug.h"
 #include "st_extensions.h"
 #include "st_format.h"
 #include "st_cb_fbo.h"
@@ -623,58 +624,6 @@ st_context_destroy(struct st_context_iface *stctxi)
    st_destroy_context(st);
 }
 
-static void
-st_debug_message(void *data,
-                 unsigned *id,
-                 enum pipe_debug_type ptype,
-                 const char *fmt,
-                 va_list args)
-{
-   struct st_context *st = data;
-   enum mesa_debug_source source;
-   enum mesa_debug_type type;
-   enum mesa_debug_severity severity;
-
-   switch (ptype) {
-   case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_ERROR;
-      severity = MESA_DEBUG_SEVERITY_MEDIUM;
-      break;
-   case PIPE_DEBUG_TYPE_ERROR:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_ERROR;
-      severity = MESA_DEBUG_SEVERITY_MEDIUM;
-      break;
-   case PIPE_DEBUG_TYPE_SHADER_INFO:
-      source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_PERF_INFO:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_PERFORMANCE;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_INFO:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_FALLBACK:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_PERFORMANCE;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_CONFORMANCE:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   }
-   _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
-}
-
 static struct st_context_iface *
 st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
                       const struct st_context_attribs *attribs,
@@ -723,17 +672,15 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
       return NULL;
    }
 
-   if (attribs->flags & ST_CONTEXT_FLAG_DEBUG){
+   if (attribs->flags & ST_CONTEXT_FLAG_DEBUG) {
       if (!_mesa_set_debug_state_int(st->ctx, GL_DEBUG_OUTPUT, GL_TRUE)) {
          *error = ST_CONTEXT_ERROR_NO_MEMORY;
          return NULL;
       }
+
       st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_DEBUG_BIT;
 
-      if (pipe->set_debug_callback) {
-         struct pipe_debug_callback cb = { st_debug_message, st };
-         pipe->set_debug_callback(pipe, &cb);
-      }
+      st_enable_debug_output(st, TRUE);
    }
 
    if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE)

From a1d664a0b7dbd8bbab0ff7a179e8960c25a52c0b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 4 Jan 2016 19:48:08 -0500
Subject: [PATCH 074/241] Revert "i965/wm: use proper API buffer size for the
 surfaces."
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 8cf2e892fca20c4776b4a07c39918343cb2d4e0e. It's
entirely bogus to attempt to store anything about the binding in the
buffer object itself, which might be bound any number of times.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Cc: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 12 ++++--------
 src/mesa/drivers/dri/i965/intel_buffer_objects.c |  1 -
 src/mesa/main/bufferobj.c                        |  4 +---
 src/mesa/main/mtypes.h                           |  1 -
 4 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 7da4a404668..76dc5775121 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -946,14 +946,12 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
-         GLsizeiptrARB size = MIN2(binding->BufferObject->BufferRangeSize,
-                                   binding->BufferObject->Size - binding->Offset);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   size);
+                                   binding->BufferObject->Size - binding->Offset);
          brw_create_constant_surface(brw, bo, binding->Offset,
-                                     size,
+                                     binding->BufferObject->Size - binding->Offset,
                                      &ubo_surf_offsets[i]);
       }
    }
@@ -970,14 +968,12 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
-         GLsizeiptrARB size = MIN2(binding->BufferObject->BufferRangeSize,
-                                   binding->BufferObject->Size - binding->Offset);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   size);
+                                   binding->BufferObject->Size - binding->Offset);
          brw_create_buffer_surface(brw, bo, binding->Offset,
-                                   size,
+                                   binding->BufferObject->Size - binding->Offset,
                                    &ssbo_surf_offsets[i]);
       }
    }
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index b26c939c214..7a5b3fca595 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -198,7 +198,6 @@ brw_buffer_data(struct gl_context *ctx,
    (void) target;
 
    intel_obj->Base.Size = size;
-   intel_obj->Base.BufferRangeSize = size;
    intel_obj->Base.Usage = usage;
    intel_obj->Base.StorageFlags = storageFlags;
 
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 8a9f9b6a48d..3a05cd55042 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -43,7 +43,7 @@
 #include "glformats.h"
 #include "texstore.h"
 #include "transformfeedback.h"
-#include "macros.h"
+
 
 /* Debug flags */
 /*#define VBO_DEBUG*/
@@ -2840,7 +2840,6 @@ bind_buffer_range_uniform_buffer(struct gl_context *ctx,
 
    _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, bufObj);
    bind_uniform_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
-   bufObj->BufferRangeSize = MIN2(size, bufObj->Size);
 }
 
 /**
@@ -2876,7 +2875,6 @@ bind_buffer_range_shader_storage_buffer(struct gl_context *ctx,
 
    _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
    bind_shader_storage_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
-   bufObj->BufferRangeSize = MIN2(size, bufObj->Size);
 }
 
 /**
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 97116e400ce..5b9fce8b7cc 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1268,7 +1268,6 @@ struct gl_buffer_object
    GLenum Usage;        /**< GL_STREAM_DRAW_ARB, GL_STREAM_READ_ARB, etc. */
    GLbitfield StorageFlags; /**< GL_MAP_PERSISTENT_BIT, etc. */
    GLsizeiptrARB Size;  /**< Size of buffer storage in bytes */
-   GLsizeiptrARB BufferRangeSize;  /**< Amount of data that can be read from or written to, in bytes */
    GLubyte *Data;       /**< Location of storage either in RAM or VRAM. */
    GLboolean DeletePending;   /**< true if buffer object is removed from the hash */
    GLboolean Written;   /**< Ever written to? (for debugging) */

From 14f21f53d50ecd6e2803b2326a90e93aceacfe24 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 4 Jan 2016 19:57:11 -0500
Subject: [PATCH 075/241] i965/wm: use binding size for ubo/ssbo when automatic
 size is unset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes the same tests that commit 8cf2e892f was attempting to fix:

ES31-CTS.shader_storage_buffer_object.advanced-unsizedArrayLength-cs-std430-vec-bindrangeOffset
ES31-CTS.shader_storage_buffer_object.advanced-unsizedArrayLength-cs-std430-vec-bindrangeSize

as confirmed by Samuel.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Cc: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 76dc5775121..5ab2f7f09df 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -946,12 +946,15 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptr size = binding->BufferObject->Size - binding->Offset;
+         if (!binding->AutomaticSize)
+            size = MIN2(size, binding->Size);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_constant_surface(brw, bo, binding->Offset,
-                                     binding->BufferObject->Size - binding->Offset,
+                                     size,
                                      &ubo_surf_offsets[i]);
       }
    }
@@ -968,12 +971,15 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptr size = binding->BufferObject->Size - binding->Offset;
+         if (!binding->AutomaticSize)
+            size = MIN2(size, binding->Size);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_buffer_surface(brw, bo, binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset,
+                                   size,
                                    &ssbo_surf_offsets[i]);
       }
    }

From 851e7e12aa628d6781b5a3af2f2fc16ee73f435f Mon Sep 17 00:00:00 2001
From: Julien Isorce <j.isorce@samsung.com>
Date: Tue, 5 Jan 2016 15:02:47 +0000
Subject: [PATCH 076/241] st/va: count number of slices

The counter was not set, but it is used by the nouveau driver.
It is required; otherwise the visual output is garbage.

Signed-off-by: Julien Isorce <j.isorce@samsung.com>
Reviewed-by: Christian Koenig <christian.koenig@amd.com>
---
 src/gallium/state_trackers/va/picture.c        | 8 ++++++++
 src/gallium/state_trackers/va/picture_h264.c   | 2 ++
 src/gallium/state_trackers/va/picture_mpeg12.c | 6 ++++++
 src/gallium/state_trackers/va/picture_vc1.c    | 7 +++++++
 src/gallium/state_trackers/va/va_private.h     | 2 ++
 5 files changed, 25 insertions(+)

diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index 7b30bf87d75..da9ca5aa6c9 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -174,6 +174,14 @@ static void
 handleSliceParameterBuffer(vlVaContext *context, vlVaBuffer *buf)
 {
    switch (u_reduce_video_profile(context->templat.profile)) {
+   case PIPE_VIDEO_FORMAT_MPEG12:
+      vlVaHandleSliceParameterBufferMPEG12(context, buf);
+      break;
+
+   case PIPE_VIDEO_FORMAT_VC1:
+      vlVaHandleSliceParameterBufferVC1(context, buf);
+      break;
+
    case PIPE_VIDEO_FORMAT_MPEG4_AVC:
       vlVaHandleSliceParameterBufferH264(context, buf);
       break;
diff --git a/src/gallium/state_trackers/va/picture_h264.c b/src/gallium/state_trackers/va/picture_h264.c
index acbfe5d61ed..883a94a2b52 100644
--- a/src/gallium/state_trackers/va/picture_h264.c
+++ b/src/gallium/state_trackers/va/picture_h264.c
@@ -48,6 +48,7 @@ void vlVaHandlePictureParameterBufferH264(vlVaDriver *drv, vlVaContext *context,
    unsigned i;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferH264) && buf->num_elements == 1);
+   context->desc.h264.slice_count = 0;
    /*CurrPic*/
    context->desc.h264.field_order_cnt[0] = h264->CurrPic.TopFieldOrderCnt;
    context->desc.h264.field_order_cnt[1] = h264->CurrPic.BottomFieldOrderCnt;
@@ -162,6 +163,7 @@ void vlVaHandleSliceParameterBufferH264(vlVaContext *context, vlVaBuffer *buf)
    VASliceParameterBufferH264 *h264 = buf->data;
 
    assert(buf->size >= sizeof(VASliceParameterBufferH264) && buf->num_elements == 1);
+   context->desc.h264.slice_count += buf->num_elements;
    context->desc.h264.num_ref_idx_l0_active_minus1 =
       h264->num_ref_idx_l0_active_minus1;
    context->desc.h264.num_ref_idx_l1_active_minus1 =
diff --git a/src/gallium/state_trackers/va/picture_mpeg12.c b/src/gallium/state_trackers/va/picture_mpeg12.c
index e587b1e9c3f..812e9e5b2a9 100644
--- a/src/gallium/state_trackers/va/picture_mpeg12.c
+++ b/src/gallium/state_trackers/va/picture_mpeg12.c
@@ -32,6 +32,7 @@ void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *contex
    VAPictureParameterBufferMPEG2 *mpeg2 = buf->data;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferMPEG2) && buf->num_elements == 1);
+   context->desc.mpeg12.num_slices = 0;
    /*horizontal_size;*/
    /*vertical_size;*/
    vlVaGetReferenceFrame(drv, mpeg2->forward_reference_picture, &context->desc.mpeg12.ref[0]);
@@ -78,3 +79,8 @@ void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf)
       context->desc.mpeg12.non_intra_matrix = NULL;
 }
 
+void vlVaHandleSliceParameterBufferMPEG12(vlVaContext *context, vlVaBuffer *buf)
+{
+   assert(buf->size >= sizeof(VASliceParameterBufferMPEG2) && buf->num_elements == 1);
+   context->desc.mpeg12.num_slices += buf->num_elements;
+}
diff --git a/src/gallium/state_trackers/va/picture_vc1.c b/src/gallium/state_trackers/va/picture_vc1.c
index f95fd8344f5..6ad1571ca96 100644
--- a/src/gallium/state_trackers/va/picture_vc1.c
+++ b/src/gallium/state_trackers/va/picture_vc1.c
@@ -32,6 +32,7 @@ void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context,
    VAPictureParameterBufferVC1 * vc1 = buf->data;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferVC1) && buf->num_elements == 1);
+   context->desc.vc1.slice_count = 0;
    vlVaGetReferenceFrame(drv, vc1->forward_reference_picture, &context->desc.vc1.ref[0]);
    vlVaGetReferenceFrame(drv, vc1->backward_reference_picture, &context->desc.vc1.ref[1]);
    context->desc.vc1.picture_type = vc1->picture_fields.bits.picture_type;
@@ -65,3 +66,9 @@ void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context,
    context->desc.vc1.deblockEnable = vc1->post_processing != 0;
    context->desc.vc1.pquant = vc1->pic_quantizer_fields.bits.pic_quantizer_scale;
 }
+
+void vlVaHandleSliceParameterBufferVC1(vlVaContext *context, vlVaBuffer *buf)
+{
+   assert(buf->size >= sizeof(VASliceParameterBufferVC1) && buf->num_elements == 1);
+   context->desc.vc1.slice_count += buf->num_elements;
+}
diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h
index fa6e0fb301e..bf9d24b2d34 100644
--- a/src/gallium/state_trackers/va/va_private.h
+++ b/src/gallium/state_trackers/va/va_private.h
@@ -351,10 +351,12 @@ VAStatus vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContex
 void vlVaGetReferenceFrame(vlVaDriver *drv, VASurfaceID surface_id, struct pipe_video_buffer **ref_frame);
 void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf);
+void vlVaHandleSliceParameterBufferMPEG12(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferH264(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferH264(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleSliceParameterBufferH264(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
+void vlVaHandleSliceParameterBufferVC1(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferMPEG4(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferMPEG4(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleSliceParameterBufferMPEG4(vlVaContext *context, vlVaBuffer *buf);

From 7ba27f60f71accaf59f267ff20580444e912ab2b Mon Sep 17 00:00:00 2001
From: Julien Isorce <julien.isorce@gmail.com>
Date: Wed, 23 Dec 2015 09:25:52 +0000
Subject: [PATCH 077/241] nouveau: split nouveau_vp3_bsp in begin/next/end

This allows nouveau_vp3_bsp_next to be called multiple times
between a single begin/end pair.

It is required to support st/va.

https://bugs.freedesktop.org/show_bug.cgi?id=89969

Signed-off-by: Julien Isorce <j.isorce@samsung.com>
[imirkin: create strparm_bsp function, simplified w0 calculation]
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nouveau_vp3_video.h       | 17 +++-
 .../drivers/nouveau/nouveau_vp3_video_bsp.c   | 93 ++++++++++++-------
 .../drivers/nouveau/nv50/nv98_video_bsp.c     |  5 +-
 .../drivers/nouveau/nvc0/nvc0_video_bsp.c     |  5 +-
 4 files changed, 78 insertions(+), 42 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
index 58df5ee847f..809e971a678 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
@@ -114,6 +114,11 @@ struct nouveau_vp3_decoder {
    unsigned fence_seq, fw_sizes, last_frame_num, tmp_stride, ref_stride;
 
    unsigned bsp_idx, vp_idx, ppp_idx;
+
+   /* End of the bsp bo where new data should be appended between one begin/end
+    * frame.
+    */
+   char *bsp_ptr;
 };
 
 struct comm {
@@ -208,11 +213,15 @@ nouveau_vp3_load_firmware(struct nouveau_vp3_decoder *dec,
                           enum pipe_video_profile profile,
                           unsigned chipset);
 
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec);
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes);
+
 uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes);
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc);
 
 void
 nouveau_vp3_vp_caps(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
index 692772e49d1..a3d07deeb18 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
@@ -230,20 +230,58 @@ nouveau_vp3_fill_picparm_h264_bsp(struct nouveau_vp3_decoder *dec,
    return caps | 3;
 }
 
-uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes)
+static inline struct strparm_bsp *strparm_bsp(struct nouveau_vp3_decoder *dec)
 {
-   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   unsigned comm_seq = dec->fence_seq;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
-   char *bsp;
-   uint32_t endmarker, caps;
-   struct strparm_bsp *str_bsp;
+   return (struct strparm_bsp *)(bsp_bo->map + 0x100);
+}
+
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec)
+{
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+
+   dec->bsp_ptr = (void *)str_bsp;
+   memset(str_bsp, 0, 0x80);
+   dec->bsp_ptr += 0x100;
+   /* Reserved for picparm_vp */
+   dec->bsp_ptr += 0x300;
+   /* Reserved for comm */
+#if !NOUVEAU_VP3_DEBUG_FENCE
+   memset(dec->bsp_ptr, 0, 0x200);
+#endif
+   dec->bsp_ptr += 0x200;
+}
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes)
+{
+#ifndef NDEBUG
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+#endif
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
    int i;
 
-   bsp = bsp_bo->map;
+   for (i = 0; i < num_buffers; ++i) {
+      assert(bsp_bo->size >= str_bsp->w0[0] + num_bytes[i]);
+      memcpy(dec->bsp_ptr, data[i], num_bytes[i]);
+      dec->bsp_ptr += num_bytes[i];
+      str_bsp->w0[0] += num_bytes[i];
+   }
+}
+
+uint32_t
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc)
+{
+   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   uint32_t endmarker, caps;
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+   char *bsp = bsp_bo->map;
    /*
     * 0x000..0x100: picparm_bsp
     * 0x200..0x500: picparm_vp
@@ -277,34 +315,21 @@ nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
    caps |= 1 << 17; // enable watchdog
    caps |= 0 << 18; // do not report error to VP, so it can continue decoding what we have
    caps |= 0 << 19; // if enabled, use crypto crap?
-   bsp += 0x100;
 
-   str_bsp = (struct strparm_bsp *)bsp;
-   memset(str_bsp, 0, 0x80);
-   str_bsp->w0[0] = 16;
+   str_bsp = strparm_bsp(dec);
    str_bsp->w1[0] = 0x1;
-   bsp += 0x100;
-   /* Reserved for picparm_vp */
-   bsp += 0x300;
-   /* Reserved for comm */
-#if !NOUVEAU_VP3_DEBUG_FENCE
-   memset(bsp, 0, 0x200);
-#endif
-   bsp += 0x200;
-   for (i = 0; i < num_buffers; ++i) {
-      memcpy(bsp, data[i], num_bytes[i]);
-      bsp += num_bytes[i];
-      str_bsp->w0[0] += num_bytes[i];
-   }
 
    /* Append end sequence */
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
-   bsp += 4;
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   str_bsp->w0[0] += 16;
+
+   dec->bsp_ptr = NULL;
 
    return caps;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
index 0f689cb2d6c..4fe0e05c96b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -106,8 +106,9 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       return -1;
    }
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   nouveau_vp3_bsp_begin(dec);
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
index 5e8cda0768d..6cedeaf9f27 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -114,8 +114,9 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       return -1;
    }
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   nouveau_vp3_bsp_begin(dec);
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 

From abb30b9c8bf3d9a40440a24b483b42057e36d844 Mon Sep 17 00:00:00 2001
From: Julien Isorce <julien.isorce@gmail.com>
Date: Wed, 23 Dec 2015 09:25:53 +0000
Subject: [PATCH 078/241] nvc0: add support for st/va

- split nvc0_decoder_bsp in begin/next/end
- preserve content buffer when calling nvc0_decoder_bsp_next
- implement pipe_video_codec::begin_frame/end_frame

https://bugs.freedesktop.org/show_bug.cgi?id=89969

Signed-off-by: Julien Isorce <j.isorce@samsung.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_video.c |  44 ++++++-
 src/gallium/drivers/nouveau/nvc0/nvc0_video.h |  18 ++-
 .../drivers/nouveau/nvc0/nvc0_video_bsp.c     | 123 ++++++++++++------
 3 files changed, 134 insertions(+), 51 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
index 5a946ca6314..a9fd1d20942 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -25,6 +25,24 @@
 #include "util/u_sampler.h"
 #include "util/u_format.h"
 
+static void
+nvc0_decoder_begin_frame(struct pipe_video_codec *decoder,
+                         struct pipe_video_buffer *target,
+                         struct pipe_picture_desc *picture)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = ++dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(dec);
+   assert(target);
+   assert(target->buffer_format == PIPE_FORMAT_NV12);
+
+   ret = nvc0_decoder_bsp_begin(dec, comm_seq);
+
+   assert(ret == 2);
+}
+
 static void
 nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               struct pipe_video_buffer *video_target,
@@ -32,10 +50,26 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               unsigned num_buffers,
                               const void *const *data,
                               const unsigned *num_bytes)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(decoder);
+
+   ret = nvc0_decoder_bsp_next(dec, comm_seq, num_buffers, data, num_bytes);
+
+   assert(ret == 2);
+}
+
+static void
+nvc0_decoder_end_frame(struct pipe_video_codec *decoder,
+                       struct pipe_video_buffer *video_target,
+                       struct pipe_picture_desc *picture)
 {
    struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
    struct nouveau_vp3_video_buffer *target = (struct nouveau_vp3_video_buffer *)video_target;
-   uint32_t comm_seq = ++dec->fence_seq;
+   uint32_t comm_seq = dec->fence_seq;
    union pipe_desc desc;
 
    unsigned vp_caps, is_ref, ret;
@@ -43,11 +77,7 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
 
    desc.base = picture;
 
-   assert(target->base.buffer_format == PIPE_FORMAT_NV12);
-
-   ret = nvc0_decoder_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes,
-                          &vp_caps, &is_ref, refs);
+   ret = nvc0_decoder_bsp_end(dec, desc, target, comm_seq, &vp_caps, &is_ref, refs);
 
    /* did we decode bitstream correctly? */
    assert(ret == 2);
@@ -164,7 +194,9 @@ nvc0_create_decoder(struct pipe_context *context,
    PUSH_DATA (push[2], dec->ppp->handle);
 
    dec->base.context = context;
+   dec->base.begin_frame = nvc0_decoder_begin_frame;
    dec->base.decode_bitstream = nvc0_decoder_decode_bitstream;
+   dec->base.end_frame = nvc0_decoder_end_frame;
 
    for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
index 9ee0280f8ea..cf3c942355b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
@@ -30,12 +30,18 @@
 #include "util/u_video.h"
 
 extern unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16]);
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq);
+
+extern unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes);
+
+extern unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target,
+                     unsigned comm_seq, unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16]);
 
 extern void
 nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
index 6cedeaf9f27..c53f946a762 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -32,40 +32,34 @@ static void dump_comm_bsp(struct comm *comm)
 #endif
 
 unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16])
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq)
+{
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   unsigned ret = 0;
+
+   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
+   if (ret) {
+      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+      return -1;
+   }
+
+   nouveau_vp3_bsp_begin(dec);
+
+   return 2;
+}
+
+unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes)
 {
-   struct nouveau_pushbuf *push = dec->pushbuf[0];
-   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
-   uint32_t bsp_addr, comm_addr, inter_addr;
-   uint32_t slice_size, bucket_size, ring_size, bsp_size;
-   uint32_t caps, i;
-   int ret;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
    struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   unsigned fence_extra = 0;
-   struct nouveau_pushbuf_refn bo_refs[] = {
-      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
-      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
-#if NOUVEAU_VP3_DEBUG_FENCE
-      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
-#endif
-      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
-   };
-   int num_refs = ARRAY_SIZE(bo_refs);
+   uint32_t bsp_size = 0;
+   uint32_t i = 0;
+   unsigned ret = 0;
 
-   if (!dec->bitplane_bo)
-      num_refs--;
-
-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
-
-   bsp_size = NOUVEAU_VP3_BSP_RESERVED_SIZE;
+   bsp_size = dec->bsp_ptr - (char *)bsp_bo->map;
    for (i = 0; i < num_buffers; i++)
       bsp_size += num_bytes[i];
    bsp_size += 256; /* the 4 end markers */
@@ -87,8 +81,23 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
+      /* Preserve previous buffer. */
+      /* TODO: offload this copy to the GPU, as otherwise we're reading and
+       * writing to VRAM. */
+      memcpy(tmp_bo->map, bsp_bo->map, bsp_bo->size);
+
+      /* update position to current chunk */
+      dec->bsp_ptr = tmp_bo->map + (dec->bsp_ptr - (char *)bsp_bo->map);
+
       nouveau_bo_ref(NULL, &bsp_bo);
-      bo_refs[0].bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
+      dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
    }
 
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
@@ -104,18 +113,54 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
       nouveau_bo_ref(NULL, &inter_bo);
-      bo_refs[1].bo = dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
+      dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
    }
 
-   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
-   if (ret) {
-      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
-      return -1;
-   }
-
-   nouveau_vp3_bsp_begin(dec);
    nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+
+   return 2;
+}
+
+
+unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
+                     unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16])
+{
+   struct nouveau_pushbuf *push = dec->pushbuf[0];
+   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   uint32_t bsp_addr, comm_addr, inter_addr;
+   uint32_t slice_size, bucket_size, ring_size;
+   uint32_t caps;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   unsigned fence_extra = 0;
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+#if NOUVEAU_VP3_DEBUG_FENCE
+      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+   };
+   int num_refs = ARRAY_SIZE(bo_refs);
+
+   if (!dec->bitplane_bo)
+      num_refs--;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+   fence_extra = 4;
+#endif
+
    caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);

From 777d1453f1053af7e051fa701b9440061bc27dce Mon Sep 17 00:00:00 2001
From: Julien Isorce <j.isorce@samsung.com>
Date: Thu, 9 Apr 2015 13:45:17 +0100
Subject: [PATCH 079/241] build: enable st/va with nouveau driver

vainfo fails in vaDriverInit because "dd_create_screen"
does not reach strcmp(driver_name, "nouveau") code.
Indeed when compiling the va target.c, the macro GALLIUM_NOUVEAU
is not defined.
This patch defines the macro the same way it is done for the dri and
vdpau targets.

Tested with:
./autogen.sh --enable-glx --enable-gles2 --enable-egl --enable-vdpau --enable-glx-tls=yes --enable-va
--with-gallium-drivers=swrast,nouveau --with-dri-drivers=swrast,nouveau --with-egl-platforms=x11

LIBVA_DRIVER_NAME=gallium vainfo
Output:
vainfo: Driver version: mesa gallium vaapi
vainfo: Supported profile and entrypoints
VAProfileMPEG2Simple                  :	VAEntrypointVLD
      VAProfileMPEG2Main              :	VAEntrypointVLD
      VAProfileMPEG4Simple            :	VAEntrypointVLD
      VAProfileMPEG4AdvancedSimple    :	VAEntrypointVLD
      VAProfileVC1Simple              :	VAEntrypointVLD
      VAProfileVC1Main                :	VAEntrypointVLD
      VAProfileVC1Advanced            :	VAEntrypointVLD
      VAProfileH264Baseline           :	VAEntrypointVLD
      VAProfileH264Main               :	VAEntrypointVLD
      VAProfileH264High               :	VAEntrypointVLD
      VAProfileNone                   :	VAEntrypointVideoProc

Signed-off-by: Julien Isorce <j.isorce@samsung.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/targets/va/Makefile.am | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/targets/va/Makefile.am b/src/gallium/targets/va/Makefile.am
index 733e7acb455..1edd5c2a5c6 100644
--- a/src/gallium/targets/va/Makefile.am
+++ b/src/gallium/targets/va/Makefile.am
@@ -42,6 +42,8 @@ TARGET_DRIVERS =
 TARGET_CPPFLAGS =
 TARGET_LIB_DEPS =
 
+include $(top_srcdir)/src/gallium/drivers/nouveau/Automake.inc
+
 include $(top_srcdir)/src/gallium/drivers/r600/Automake.inc
 include $(top_srcdir)/src/gallium/drivers/radeonsi/Automake.inc
 

From 6531ccb7056f80c32a29e07fe05381b0fd6557dc Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 4 Jan 2016 23:28:52 -0500
Subject: [PATCH 080/241] i965: quieten compiler warning about out-of-bounds
 access

gcc 4.9.3 shows the following warning:

brw_vue_map.c:260:20: warning: array subscript is above array bounds
[-Warray-bounds]
    return brw_names[slot - VARYING_SLOT_MAX];

This is because BRW_VARYING_SLOT_COUNT is a valid value for the enum
type. Adding an assert will generate no additional code but will teach
the compiler to not complain.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
---
 src/mesa/drivers/dri/i965/brw_vue_map.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 09eadbcb54f..fea24368e8c 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -257,6 +257,7 @@ varying_name(brw_varying_slot slot)
       [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC",
    };
 
+   assert(slot < BRW_VARYING_SLOT_COUNT);
    return brw_names[slot - VARYING_SLOT_MAX];
 }
 

From b11bd20889bf7c5a728e69abecb6f10c4475d732 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:04 -0700
Subject: [PATCH 081/241] svga: check for no-ops in svga_bind_sampler_states()

and svga_set_sampler_views().  If there's no change, return early
and don't set a SVGA_NEW_x dirty state flag.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/drivers/svga/svga_pipe_sampler.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 95241176510..3e778f0a087 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -287,6 +287,7 @@ svga_bind_sampler_states(struct pipe_context *pipe,
 {
    struct svga_context *svga = svga_context(pipe);
    unsigned i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= PIPE_MAX_SAMPLERS);
@@ -295,8 +296,15 @@ svga_bind_sampler_states(struct pipe_context *pipe,
    if (!svga_have_vgpu10(svga) && shader != PIPE_SHADER_FRAGMENT)
       return;
 
-   for (i = 0; i < num; i++)
+   for (i = 0; i < num; i++) {
+      if (svga->curr.sampler[shader][start + i] != samplers[i])
+         any_change = TRUE;
       svga->curr.sampler[shader][start + i] = samplers[i];
+   }
+
+   if (!any_change) {
+      return;
+   }
 
    /* find highest non-null sampler[] entry */
    {
@@ -405,6 +413,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
    unsigned flag_1d = 0;
    unsigned flag_srgb = 0;
    uint i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= Elements(svga->curr.sampler_views[shader]));
@@ -422,6 +431,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
          pipe_sampler_view_release(pipe, &svga->curr.sampler_views[shader][start + i]);
          pipe_sampler_view_reference(&svga->curr.sampler_views[shader][start + i],
                                      views[i]);
+         any_change = TRUE;
       }
 
       if (!views[i])
@@ -434,6 +444,10 @@ svga_set_sampler_views(struct pipe_context *pipe,
          flag_1d |= 1 << (start + i);
    }
 
+   if (!any_change) {
+      return;
+   }
+
    /* find highest non-null sampler_views[] entry */
    {
       unsigned j = MAX2(svga->curr.num_sampler_views[shader], start + num);

From 077aa3be93af353b6e1fcc8de62bfc37a574ec52 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:04 -0700
Subject: [PATCH 082/241] svga: avoid emitting redundant SetVertexBuffers()
 commands

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/drivers/svga/svga_context.h |  5 +++++
 src/gallium/drivers/svga/svga_draw.c    | 26 ++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index c282932cb18..d21b0716bc6 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -345,6 +345,11 @@ struct svga_hw_draw_state
    SVGA3dElementLayoutId layout_id;
    SVGA3dPrimitiveType topology;
 
+   /** Vertex buffer state */
+   SVGA3dVertexBuffer vbuffers[PIPE_MAX_ATTRIBS];
+   struct svga_winsys_surface *vbuffer_handles[PIPE_MAX_ATTRIBS];
+   unsigned num_vbuffers;
+
    struct svga_winsys_surface *ib;  /**< index buffer for drawing */
    SVGA3dSurfaceFormat ib_format;
    unsigned ib_offset;
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 2d3631d6f9c..d4c9914afbd 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -517,11 +517,27 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
          buffers[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
       }
       if (vbuf_count > 0) {
-         ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
-                                              0,    /* startBuffer */
-                                              buffers, vb_handle);
-         if (ret != PIPE_OK)
-            return ret;
+         /* If we haven't yet emitted a drawing command or if any
+          * vertex buffer state is changing, issue that state now.
+          */
+         if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) == 0) ||
+             vbuf_count != svga->state.hw_draw.num_vbuffers ||
+             memcmp(buffers, svga->state.hw_draw.vbuffers,
+                    vbuf_count * sizeof(buffers[0])) ||
+             memcmp(vb_handle, svga->state.hw_draw.vbuffer_handles,
+                    vbuf_count * sizeof(vb_handle[0]))) {
+            ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
+                                                 0,    /* startBuffer */
+                                                 buffers, vb_handle);
+            if (ret != PIPE_OK)
+               return ret;
+
+            svga->state.hw_draw.num_vbuffers = vbuf_count;
+            memcpy(svga->state.hw_draw.vbuffers, buffers,
+                   vbuf_count * sizeof(buffers[0]));
+            memcpy(svga->state.hw_draw.vbuffer_handles, vb_handle,
+                   vbuf_count * sizeof(vb_handle[0]));
+         }
       }
    }
 

From fc076588956d5b966913e7f8e47cbcb4bc5cd96b Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:04 -0700
Subject: [PATCH 083/241] svga: change svga_hw_view_state::dirty to boolean

Since it's a true/false value.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/drivers/svga/svga_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index d21b0716bc6..e4f29b8497e 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -314,7 +314,7 @@ struct svga_hw_view_state
    struct svga_sampler_view *v;
    unsigned min_lod;
    unsigned max_lod;
-   int dirty;
+   boolean dirty;
 };
 
 /* Updated by calling svga_update_state( SVGA_STATE_HW_DRAW )

From 993b04ee2c415e15f89e2717cfeff90225452937 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:04 -0700
Subject: [PATCH 084/241] svga: add some comments in svga_state_vs.c

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/drivers/svga/svga_state_vs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 24574c1bf85..a103dab25fe 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -173,8 +173,11 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
       return;
    }
 
+   /* SVGA_NEW_PRESCALE */
    key->vs.need_prescale = svga->state.hw_clear.prescale.enabled &&
                            (svga->curr.gs == NULL);
+
+   /* SVGA_NEW_RAST */
    key->vs.allow_psiz = svga->curr.rast->templ.point_size_per_vertex;
 
    /* SVGA_NEW_FS */

From eec8d7e7e059c19b86cce0360cb7db28aef1f1dd Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:04 -0700
Subject: [PATCH 085/241] svga: fix test for SVGA_NEW_STIPPLE

We only want to set the SVGA_NEW_STIPPLE dirty flag when the polygon
stipple state changes.  Before, we only set the flag when we were
enabling stipple, but not disabling.

We don't really have to add SVGA_NEW_STIPPLE to the dirty FS state
set since it's a subset of SVGA_NEW_RAST, but let's be explicit.

This doesn't fix any known bugs.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/drivers/svga/svga_pipe_rasterizer.c | 11 +++++++----
 src/gallium/drivers/svga/svga_state_fs.c        |  1 +
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
index fa1744fc33e..8e0db539574 100644
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -368,13 +368,16 @@ static void svga_bind_rasterizer_state( struct pipe_context *pipe,
    struct svga_context *svga = svga_context(pipe);
    struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state;
 
+   if (!raster ||
+       !svga->curr.rast ||
+       raster->templ.poly_stipple_enable !=
+       svga->curr.rast->templ.poly_stipple_enable) {
+      svga->dirty |= SVGA_NEW_STIPPLE;
+   }
+
    svga->curr.rast = raster;
 
    svga->dirty |= SVGA_NEW_RAST;
-
-   if (raster && raster->templ.poly_stipple_enable) {
-      svga->dirty |= SVGA_NEW_STIPPLE;
-   }
 }
 
 static void
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index e392778c2fb..bac91669be1 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -452,6 +452,7 @@ struct svga_tracked_state svga_hw_fs =
     SVGA_NEW_TEXTURE_BINDING |
     SVGA_NEW_NEED_SWTNL |
     SVGA_NEW_RAST |
+    SVGA_NEW_STIPPLE |
     SVGA_NEW_REDUCED_PRIMITIVE |
     SVGA_NEW_SAMPLER |
     SVGA_NEW_FRAME_BUFFER |

From f04d7439a0ad6e13ff2912ff824553b6bcf511a4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:04 -0700
Subject: [PATCH 086/241] mesa: check for z=0 in _mesa_Vertex3dv()

It's very rare that a GL app calls glVertex3dv(), but one in particular
calls it a lot, always with Z = 0.  Check for that condition and convert
the call into glVertex2f.  This reduces VBO memory used and reduces
the number of times we have to switch between float[2] and float[3]
vertex formats in the svga driver.  This results in a small but
measurable performance improvement.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/main/api_loopback.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index a7fd82c531f..8b63d9c0e95 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -629,7 +629,10 @@ _mesa_Vertex2sv( const GLshort *v )
 void GLAPIENTRY
 _mesa_Vertex3dv( const GLdouble *v )
 {
-   VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] );
+   if (v[2] == 0.0)
+      VERTEX2( (GLfloat) v[0], (GLfloat) v[1] );
+   else
+      VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] );
 }
 
 void GLAPIENTRY

From 95d412181d1bc32fb94df92198e3fd0a4a368ac4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:04 -0700
Subject: [PATCH 087/241] util: add debug_dump_ubyte_rgba_bmp()

Like debug_dump_float_rgba_bmp() but takes ubyte values.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/auxiliary/util/u_debug.c | 59 ++++++++++++++++++++++++++++
 src/gallium/auxiliary/util/u_debug.h |  4 ++
 2 files changed, 63 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index cb162d89a58..2b605594a2e 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -727,6 +727,65 @@ error1:
    ;
 }
 
+void
+debug_dump_ubyte_rgba_bmp(const char *filename,
+                          unsigned width, unsigned height,
+                          const ubyte *rgba, unsigned stride)
+{
+   FILE *stream;
+   struct bmp_file_header bmfh;
+   struct bmp_info_header bmih;
+   unsigned x, y;
+
+   assert(rgba);
+   if(!rgba)
+      goto error1;
+
+   bmfh.bfType = 0x4d42;
+   bmfh.bfSize = 14 + 40 + height*width*4;
+   bmfh.bfReserved1 = 0;
+   bmfh.bfReserved2 = 0;
+   bmfh.bfOffBits = 14 + 40;
+
+   bmih.biSize = 40;
+   bmih.biWidth = width;
+   bmih.biHeight = height;
+   bmih.biPlanes = 1;
+   bmih.biBitCount = 32;
+   bmih.biCompression = 0;
+   bmih.biSizeImage = height*width*4;
+   bmih.biXPelsPerMeter = 0;
+   bmih.biYPelsPerMeter = 0;
+   bmih.biClrUsed = 0;
+   bmih.biClrImportant = 0;
+
+   stream = fopen(filename, "wb");
+   assert(stream);
+   if(!stream)
+      goto error1;
+
+   fwrite(&bmfh, 14, 1, stream);
+   fwrite(&bmih, 40, 1, stream);
+
+   y = height;
+   while(y--) {
+      const ubyte *ptr = rgba + (stride * y * 4);
+      for(x = 0; x < width; ++x)
+      {
+         struct bmp_rgb_quad pixel;
+         pixel.rgbRed   = ptr[x*4 + 0];
+         pixel.rgbGreen = ptr[x*4 + 1];
+         pixel.rgbBlue  = ptr[x*4 + 2];
+         pixel.rgbAlpha = ptr[x*4 + 3];
+         fwrite(&pixel, 1, 4, stream);
+      }
+   }
+
+   fclose(stream);
+error1:
+   ;
+}
+
 
 /**
  * Print PIPE_TRANSFER_x flags with a message.
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 34668f844e9..671bd37a085 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -490,12 +490,16 @@ void debug_dump_transfer_bmp(struct pipe_context *pipe,
 void debug_dump_float_rgba_bmp(const char *filename,
                                unsigned width, unsigned height,
                                float *rgba, unsigned stride);
+void debug_dump_ubyte_rgba_bmp(const char *filename,
+                               unsigned width, unsigned height,
+                               const ubyte *rgba, unsigned stride);
 #else
 #define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0)
 #define debug_dump_surface(pipe, prefix, surface) ((void)0)
 #define debug_dump_surface_bmp(pipe, filename, surface) ((void)0)
 #define debug_dump_transfer_bmp(filename, transfer, ptr) ((void)0)
 #define debug_dump_float_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
+#define debug_dump_ubyte_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
 #endif
 
 

From dce1e1a8eb871945f875c17d0e9b687a29835530 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:05 -0700
Subject: [PATCH 088/241] mesa: minor clean-up of some memcpy/sizeof() calls in
 m_matrix.c

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/math/m_matrix.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/math/m_matrix.c b/src/mesa/math/m_matrix.c
index 6522200b345..b3cfcd26a14 100644
--- a/src/mesa/math/m_matrix.c
+++ b/src/mesa/math/m_matrix.c
@@ -654,7 +654,7 @@ static GLboolean invert_matrix_3d_no_rot( GLmatrix *mat )
    if (MAT(in,0,0) == 0 || MAT(in,1,1) == 0 || MAT(in,2,2) == 0 )
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
    MAT(out,2,2) = 1.0F / MAT(in,2,2);
@@ -687,7 +687,7 @@ static GLboolean invert_matrix_2d_no_rot( GLmatrix *mat )
    if (MAT(in,0,0) == 0 || MAT(in,1,1) == 0)
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
 
@@ -709,7 +709,7 @@ static GLboolean invert_matrix_perspective( GLmatrix *mat )
    if (MAT(in,2,3) == 0)
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
 
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
@@ -802,7 +802,7 @@ _math_matrix_rotate( GLmatrix *mat,
    s = sinf( angle * M_PI / 180.0 );
    c = cosf( angle * M_PI / 180.0 );
 
-   memcpy(m, Identity, sizeof(GLfloat)*16);
+   memcpy(m, Identity, sizeof(Identity));
    optimized = GL_FALSE;
 
 #define M(row,col)  m[col*4+row]
@@ -1136,8 +1136,8 @@ _math_matrix_viewport(GLmatrix *m, const float scale[3],
 void
 _math_matrix_set_identity( GLmatrix *mat )
 {
-   memcpy( mat->m, Identity, 16*sizeof(GLfloat) );
-   memcpy( mat->inv, Identity, 16*sizeof(GLfloat) );
+   memcpy( mat->m, Identity, sizeof(Identity) );
+   memcpy( mat->inv, Identity, sizeof(Identity) );
 
    mat->type = MATRIX_IDENTITY;
    mat->flags &= ~(MAT_DIRTY_FLAGS|
@@ -1437,7 +1437,7 @@ _math_matrix_is_dirty( const GLmatrix *m )
 void
 _math_matrix_copy( GLmatrix *to, const GLmatrix *from )
 {
-   memcpy( to->m, from->m, sizeof(Identity) );
+   memcpy(to->m, from->m, 16 * sizeof(GLfloat));
    memcpy(to->inv, from->inv, 16 * sizeof(GLfloat));
    to->flags = from->flags;
    to->type = from->type;

From f4caa7d2fcbfd64d8858a502d1deed8e56109464 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:03:05 -0700
Subject: [PATCH 089/241] draw: minor indentation fix

---
 src/gallium/auxiliary/draw/draw_pt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 0204b439dee..5a49acb64d3 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -524,7 +524,7 @@ draw_vbo(struct draw_context *draw,
 #endif
    {
       if (index_limit == 0) {
-      /* one of the buffers is too small to do any valid drawing */
+         /* one of the buffers is too small to do any valid drawing */
          debug_warning("draw: VBO too small to draw anything\n");
          util_fpstate_set(fpstate);
          return;

From a13e9adbee2f273ced137f96b770aff7181b363c Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 13:04:46 -0700
Subject: [PATCH 090/241] st/mesa: minor indentation fixes

---
 src/mesa/state_tracker/st_atom_rasterizer.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 55d5e66243c..c20cadf508f 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -220,13 +220,13 @@ static void update_raster_state( struct st_context *st )
    raster->line_smooth = ctx->Line.SmoothFlag;
    if (ctx->Line.SmoothFlag) {
       raster->line_width = CLAMP(ctx->Line.Width,
-                                ctx->Const.MinLineWidthAA,
-                                ctx->Const.MaxLineWidthAA);
+                                 ctx->Const.MinLineWidthAA,
+                                 ctx->Const.MaxLineWidthAA);
    }
    else {
       raster->line_width = CLAMP(ctx->Line.Width,
-                                ctx->Const.MinLineWidth,
-                                ctx->Const.MaxLineWidth);
+                                 ctx->Const.MinLineWidth,
+                                 ctx->Const.MaxLineWidth);
    }
 
    raster->line_stipple_enable = ctx->Line.StippleFlag;

From ee4676aa57a0b1d4ac7e2f30fd45c36807fe35ea Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 17 Dec 2015 17:50:34 -0800
Subject: [PATCH 091/241] i915/i965: Fix typo in perf_debug message

Trivial

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/drivers/dri/i915/intel_pixel_copy.c | 2 +-
 src/mesa/drivers/dri/i965/intel_pixel_copy.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i915/intel_pixel_copy.c b/src/mesa/drivers/dri/i915/intel_pixel_copy.c
index a7185564e47..213cdbd0f53 100644
--- a/src/mesa/drivers/dri/i915/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i915/intel_pixel_copy.c
@@ -138,7 +138,7 @@ do_blit_copypixels(struct gl_context * ctx,
    }
 
    if (ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F) {
-      perf_debug("glCopyPixles(): Unsupported pixel zoom\n");
+      perf_debug("glCopyPixels(): Unsupported pixel zoom\n");
       return false;
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_copy.c b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
index 3b5bdb8f928..05c35bd61b3 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
@@ -142,7 +142,7 @@ do_blit_copypixels(struct gl_context * ctx,
    }
 
    if (ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F) {
-      perf_debug("glCopyPixles(): Unsupported pixel zoom\n");
+      perf_debug("glCopyPixels(): Unsupported pixel zoom\n");
       return false;
    }
 

From 3c8b97a45b824712452b1425ed5cac3b97286439 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 28 Jul 2015 10:47:35 -0700
Subject: [PATCH 092/241] i965/gen9: Don't use fast copy blit in case of non
 power of 2 cpp

Fast copy blit is currently enabled for use only with Yf/Ys tiling.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/intel_blit.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index bd204aa3ce8..474a476c541 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -427,8 +427,10 @@ can_fast_copy_blit(struct brw_context *brw,
    if ((dst_offset | src_offset) & 63)
       return false;
 
-   /* Color depth greater than 128 bits not supported. */
-   if (cpp > 16)
+   /* Color depths which are not power of 2 or greater than 128 bits are
+    * not supported.
+    */
+   if (!_mesa_is_pow_two(cpp) || cpp > 16)
       return false;
 
    /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15

From 5cbe01c83fb78009371535e558eb21e213312416 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 3 Nov 2015 10:31:45 -0800
Subject: [PATCH 093/241] i965/gen9: Remove regions overlap check in fast copy
 blit

Overlapping blits are undefined in OpenGL anyway, so there is no need
for an overlap check here.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/intel_blit.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index 474a476c541..d4e25d89a55 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -406,11 +406,6 @@ can_fast_copy_blit(struct brw_context *brw,
    if (brw->gen < 9)
       return false;
 
-   if (src_buffer->handle == dst_buffer->handle &&
-       _mesa_regions_overlap(src_x, src_y, src_x + w, src_y + h,
-                             dst_x, dst_y, dst_x + w, dst_y + h))
-      return false;
-
    /* Enable fast copy blit only if the surfaces are Yf/Ys tiled.
     * FIXME: Based on performance data, remove this condition later to
     * enable for all types of surfaces.

From 0bf037c0fed0df655a3bb259348bb03389c00ddb Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 10 Nov 2015 15:33:53 -0800
Subject: [PATCH 094/241] i965/gen9: Return false in place of assert in
 intelEmitCopyBlit()

This allows the fallback paths to handle it correctly.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/intel_blit.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index d4e25d89a55..6d29fbdde21 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -564,9 +564,10 @@ intelEmitCopyBlit(struct brw_context *brw,
                                            dst_offset, dst_pitch,
                                            dst_tiling, dst_tr_mode,
                                            w, h, cpp);
-   assert(use_fast_copy_blit ||
-          (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE &&
-           dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE));
+   if (!use_fast_copy_blit &&
+       (src_tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
+        dst_tr_mode != INTEL_MIPTREE_TRMODE_NONE))
+      return false;
 
    if (use_fast_copy_blit) {
       /* When two sequential fast copy blits have different source surfaces,

From 4d2a7f511169ed4c7f63189f21f2acc7577da94a Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Tue, 24 Mar 2015 16:07:40 -0700
Subject: [PATCH 095/241] i965/gen9: Modify the conditions to use blitter on
 skl+

Conditions modified allow skl+ to use blitter:
 - for all tiling formats
 - to write data to YF/YS tiled surfaces

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 88c0a19bed6..108dd87dd8b 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2697,13 +2697,17 @@ use_intel_mipree_map_blit(struct brw_context *brw,
 {
    if (brw->has_llc &&
       /* It's probably not worth swapping to the blit ring because of
-       * all the overhead involved.
+       * all the overhead involved. But, we must use blitter for the
+       * surfaces with INTEL_MIPTREE_TRMODE_{YF,YS}.
        */
-       !(mode & GL_MAP_WRITE_BIT) &&
+       (!(mode & GL_MAP_WRITE_BIT) ||
+        mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) &&
        !mt->compressed &&
        (mt->tiling == I915_TILING_X ||
         /* Prior to Sandybridge, the blitter can't handle Y tiling */
-        (brw->gen >= 6 && mt->tiling == I915_TILING_Y)) &&
+        (brw->gen >= 6 && mt->tiling == I915_TILING_Y) ||
+        /* Fast copy blit on skl+ supports all tiling formats. */
+        brw->gen >= 9) &&
        can_blit_slice(mt, level, slice))
       return true;
 
@@ -2772,6 +2776,8 @@ intel_miptree_map(struct brw_context *brw,
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
+      /* intel_miptree_map_gtt() doesn't support surfaces with Yf/Ys tiling. */
+      assert(mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE);
       intel_miptree_map_gtt(brw, mt, map, level, slice);
    }
 

From 0508d9504aa71cd5032ec27a8ab1127037f2df78 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 24 Dec 2015 09:50:59 +1100
Subject: [PATCH 096/241] glsl: only add outward facing varyings to resource
 list for SSO

An SSO program can have multiple stages and we only want to add the externally
facing varyings. The current code was adding both the packed inputs and outputs
for the first and last stage of each program.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/linker.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index a6e81b41f3c..acc63ae8cf6 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3433,7 +3433,7 @@ add_interface_variables(struct gl_shader_program *shProg,
 }
 
 static bool
-add_packed_varyings(struct gl_shader_program *shProg, int stage)
+add_packed_varyings(struct gl_shader_program *shProg, int stage, GLenum type)
 {
    struct gl_shader *sh = shProg->_LinkedShaders[stage];
    GLenum iface;
@@ -3454,10 +3454,13 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage)
          default:
             unreachable("unexpected type");
          }
-         if (!add_program_resource(shProg, iface, var,
-                                   build_stageref(shProg, var->name,
-                                                  var->data.mode)))
-            return false;
+
+         if (type == iface) {
+            if (!add_program_resource(shProg, iface, var,
+                                      build_stageref(shProg, var->name,
+                                                     var->data.mode)))
+               return false;
+         }
       }
    }
    return true;
@@ -3724,9 +3727,9 @@ build_program_resource_list(struct gl_shader_program *shProg)
 
    /* Program interface needs to expose varyings in case of SSO. */
    if (shProg->SeparateShader) {
-      if (!add_packed_varyings(shProg, input_stage))
+      if (!add_packed_varyings(shProg, input_stage, GL_PROGRAM_INPUT))
          return;
-      if (!add_packed_varyings(shProg, output_stage))
+      if (!add_packed_varyings(shProg, output_stage, GL_PROGRAM_OUTPUT))
          return;
    }
 

From 21590a307cac5cf9fc963f0700131c2d8b0d9731 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 23 Dec 2015 14:11:04 +1100
Subject: [PATCH 097/241] glsl: move lowering after matching validation

After lowering, the matching flag is_unmatched_generic_inout is lost, so
we need to run this validation before lowering.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/link_varyings.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 1ff25b85253..6119eff2fa3 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -1629,17 +1629,6 @@ assign_varying_locations(struct gl_context *ctx,
    hash_table_dtor(consumer_inputs);
    hash_table_dtor(consumer_interface_inputs);
 
-   if (!disable_varying_packing) {
-      if (producer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
-                               0, producer);
-      }
-      if (consumer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               consumer_vertices, consumer);
-      }
-   }
-
    if (consumer && producer) {
       foreach_in_list(ir_instruction, node, consumer->ir) {
          ir_variable *const var = node->as_variable();
@@ -1689,6 +1678,17 @@ assign_varying_locations(struct gl_context *ctx,
       }
    }
 
+   if (!disable_varying_packing) {
+      if (producer) {
+         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
+                               0, producer);
+      }
+      if (consumer) {
+         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
+                               consumer_vertices, consumer);
+      }
+   }
+
    return true;
 }
 

From e1e1b678785e112326cc68c40990460deff05abc Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 23 Dec 2015 14:26:49 +1100
Subject: [PATCH 098/241] glsl: don't change the varying type in validation
 code

There is a function dedicated to demoting unused varyings; let's
trust it to do its job.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/link_varyings.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 6119eff2fa3..c43abbcc6a5 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -1669,11 +1669,6 @@ assign_varying_locations(struct gl_context *ctx,
 			    var->name,
                             _mesa_shader_stage_to_string(producer->Stage));
             }
-
-            /* An 'in' variable is only really a shader input if its
-             * value is written by the previous stage.
-             */
-            var->data.mode = ir_var_auto;
          }
       }
    }

From 3d402d445003f00478d7eb51eec8e4f31fef9352 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 6 Jan 2016 11:27:05 +1100
Subject: [PATCH 099/241] mesa: fix GL_MAX_NAME_LENGTH query for tessellation
 shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes some piglit subtests for ARB_program_interface_query.

V3: remove some of the unnecessary parentheses
V2: fix alignment

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/main/shader_query.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index e526119db19..b25732a2e3b 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -764,8 +764,12 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
 static bool
 add_index_to_name(struct gl_program_resource *res)
 {
-   bool add_index = !(((res->Type == GL_PROGRAM_INPUT) &&
-                       res->StageReferences & (1 << MESA_SHADER_GEOMETRY)));
+   bool add_index = !((res->Type == GL_PROGRAM_INPUT &&
+                       res->StageReferences & (1 << MESA_SHADER_GEOMETRY |
+                                               1 << MESA_SHADER_TESS_CTRL |
+                                               1 << MESA_SHADER_TESS_EVAL)) ||
+                      (res->Type == GL_PROGRAM_OUTPUT &&
+                       res->StageReferences & 1 << MESA_SHADER_TESS_CTRL));
 
    /* Transform feedback varyings have array index already appended
     * in their names.

From 25b7e4a01f20df4ca94a5381dee5b33294f6c161 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 5 Jan 2016 04:46:33 -0800
Subject: [PATCH 100/241] i965: Use rcp in brw_lower_texture_gradients rather
 than 1.0 / x.

That's what it's for.  Plus, we actually implement rcp.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
index d571ecd4394..c83b2728b98 100644
--- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
+++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
@@ -279,7 +279,7 @@ lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 
       /* 2. quotient rule */
       ir_variable *recip = temp(mem_ctx, glsl_type::float_type, "recip");
-      EMIT(assign(recip, div(new(mem_ctx) ir_constant(1.0f), swizzle_z(Q))));
+      EMIT(assign(recip, expr(ir_unop_rcp, swizzle_z(Q))));
 
       ir_variable *dx = temp(mem_ctx, glsl_type::vec2_type, "dx");
       ir_variable *dy = temp(mem_ctx, glsl_type::vec2_type, "dy");

From bd21b54607615605b6335282029687bb0885f4ad Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 5 Jan 2016 02:54:50 -0800
Subject: [PATCH 101/241] i965: Only turn on ARB_compute_shader if we can write
 registers.

Compute shaders require reconfiguring the L3 for shared local memory
support.  We have to be able to write the L3 registers to do that.

This effectively turns off compute shaders prior to Kernel 4.2.

(Previously, the extension enable was in an API_OPENGL_CORE conditional.
However, that isn't necessary - core Mesa extension handling already
restricts it properly.  I've moved it out in this patch.)

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/intel_extensions.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index e1338e92e15..889f7cbb5c1 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -346,6 +346,9 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_transform_feedback3 = true;
          ctx->Extensions.ARB_transform_feedback_instanced = true;
 
+         if (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024)
+            ctx->Extensions.ARB_compute_shader = true;
+
          if (brw->intelScreen->cmd_parser_version >= 2)
             brw->predicate.supported = true;
       }
@@ -357,8 +360,6 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
          ctx->Extensions.ARB_shader_subroutine = true;
-         if (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024)
-            ctx->Extensions.ARB_compute_shader = true;
       }
    }
 

From 7295f4fcc2b2dd1bc6a8d1d834774b8152a029cf Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 5 Jan 2016 05:09:46 -0800
Subject: [PATCH 102/241] nir: Add a lower_fdiv option, turn fdiv into
 fmul/frcp.

The nir_opt_algebraic rule

(('fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),

can produce new fdiv operations, which need to be lowered on i965,
as we don't actually implement fdiv.  (Normally, we handle this in
GLSL IR's lower_instructions pass, but in the above case we introduce
an fdiv after that point.  So, make NIR do it for us.)

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/glsl/nir/nir.h                       | 1 +
 src/glsl/nir/nir_opt_algebraic.py        | 1 +
 src/mesa/drivers/dri/i965/brw_shader.cpp | 1 +
 3 files changed, 3 insertions(+)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 42867382544..fed8a973416 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1440,6 +1440,7 @@ typedef struct nir_function {
 } nir_function;
 
 typedef struct nir_shader_compiler_options {
+   bool lower_fdiv;
    bool lower_ffma;
    bool lower_flrp;
    bool lower_fpow;
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index 1fdad3d78a6..c553de577ee 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -183,6 +183,7 @@ optimizations = [
    (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
    # Division and reciprocal
    (('fdiv', 1.0, a), ('frcp', a)),
+   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
    (('frcp', ('frcp', a)), a),
    (('frcp', ('fsqrt', a)), ('frsq', a)),
    (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index d4b6410815e..4bd24a70b55 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -97,6 +97,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    nir_shader_compiler_options *nir_options =
       rzalloc(compiler, nir_shader_compiler_options);
    nir_options->native_integers = true;
+   nir_options->lower_fdiv = true;
    /* In order to help allow for better CSE at the NIR level we tell NIR
     * to split all ffma instructions during opt_algebraic and we then
     * re-combine them as a later step.

From afe88f66a8a9cf3c6bf6ea5d3e00589c22219c30 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 3 Dec 2015 09:11:04 +0200
Subject: [PATCH 103/241] configure.ac: Detect if running on POWER8 arch

To determine if we could use special POWER8 assembly directives, we first
need to detect whether we are running on POWER8 architecture. This patch
adds this detection to configure.ac and adds the necessary compilation
flags accordingly.

v2:

- Add option to disable POWER8 instructions generation
- Detect whether building on BE or LE machine and build with
  -mpower8-vector only on LE machine
- Make the printed messages more standard

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 configure.ac | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/configure.ac b/configure.ac
index f8a70bef0e2..b1c1d7df4d8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -396,6 +396,61 @@ fi
 AM_CONDITIONAL([SSE41_SUPPORTED], [test x$SSE41_SUPPORTED = x1])
 AC_SUBST([SSE41_CFLAGS], $SSE41_CFLAGS)
 
+dnl Check for Endianness
+AC_C_BIGENDIAN(
+   little_endian=no,
+   little_endian=yes,
+   little_endian=no,
+   little_endian=no
+)
+
+dnl Check for POWER8 Architecture
+PWR8_CFLAGS="-mpower8-vector"
+have_pwr8_intrinsics=no
+AC_MSG_CHECKING(whether gcc supports -mpower8-vector)
+save_CFLAGS=$CFLAGS
+CFLAGS="$PWR8_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+#error "Need GCC >= 4.8 for sane POWER8 support"
+#endif
+#include <altivec.h>
+int main () {
+    vector unsigned char r;
+    vector unsigned int v = vec_splat_u32 (1);
+    r = __builtin_vec_vgbbd ((vector unsigned char) v);
+    return 0;
+}]])], have_pwr8_intrinsics=yes)
+CFLAGS=$save_CFLAGS
+
+AC_ARG_ENABLE(pwr8,
+   [AC_HELP_STRING([--disable-pwr8],
+                   [disable POWER8-specific instructions])],
+   [enable_pwr8=$enableval], [enable_pwr8=auto])
+
+if test "x$enable_pwr8" = xno ; then
+   have_pwr8_intrinsics=disabled
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = yes ; then
+   DEFINES="$DEFINES -D_ARCH_PWR8"
+   CXXFLAGS="$CXXFLAGS $PWR8_CFLAGS"
+   CFLAGS="$CFLAGS $PWR8_CFLAGS"
+else
+   PWR8_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_pwr8_intrinsics)
+if test "x$enable_pwr8" = xyes && test $have_pwr8_intrinsics = no ; then
+   AC_MSG_ERROR([POWER8 compiler support not detected])
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = no ; then
+   AC_MSG_WARN([POWER8 optimization is enabled only on POWER8 Little-Endian])
+fi
+
+AC_SUBST([PWR8_CFLAGS], $PWR8_CFLAGS)
+
 dnl Can't have static and shared libraries, default to static if user
 dnl explicitly requested. If both disabled, set to static since shared
 dnl was explicitly requested.

From e99555ef0bf1b786a1bf1e93f3304507dbb6e939 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 3 Dec 2015 09:11:13 +0200
Subject: [PATCH 104/241] llvmpipe: add POWER8 portability file - u_pwr8.h

This file provides a portability layer that will make it easier to convert
SSE-based functions to VMX/VSX-based functions.

All the functions implemented in this file are prefixed using "vec_".
Therefore, when converting from SSE-based function, one needs to simply
replace the "_mm_" prefix of the SSE function being called to "vec_".

Having said that, not all functions could be converted as such, due to the
differences between the architectures. So, when doing such a
conversion would hurt performance, I preferred to implement a more ad-hoc
solution. For example, converting the _mm_shuffle_epi32 needed to be done
using ad-hoc masks instead of a generic function.

All the functions in this file support both little-endian and big-endian
but currently the file is built only on POWER8 LE machines.

All of the functions are implemented using the Altivec/VMX intrinsics,
except one where I needed to use inline assembly (due to missing
intrinsic).

v2:
- Use vec_vgbbd instead of __builtin_vec_vgbbd
- Add an aligned load function
- Don't use typeof()
- Make file build only on POWER8 LE machine

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/util/u_pwr8.h | 310 ++++++++++++++++++++++++++++
 1 file changed, 310 insertions(+)
 create mode 100644 src/gallium/auxiliary/util/u_pwr8.h

diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h
new file mode 100644
index 00000000000..1eca6d6df2c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_pwr8.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2015 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Oded Gabbay <oded.gabbay@redhat.com>
+ */
+
+/**
+ * @file
+ * POWER8 intrinsics portability header.
+ *
+ */
+
+#ifndef U_PWR8_H_
+#define U_PWR8_H_
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))
+
+typedef VECTOR_ALIGN_16 vector unsigned char __m128i;
+
+typedef VECTOR_ALIGN_16 union m128i {
+   __m128i m128i;
+   vector signed int m128si;
+   vector unsigned int m128ui;
+   ubyte ub[16];
+   ushort us[8];
+   int i[4];
+   uint ui[4];
+} __m128i_union;
+
+static inline __m128i
+vec_set_epi32 (int i3, int i2, int i1, int i0)
+{
+   __m128i_union vdst;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vdst.i[0] = i0;
+   vdst.i[1] = i1;
+   vdst.i[2] = i2;
+   vdst.i[3] = i3;
+#else
+   vdst.i[3] = i0;
+   vdst.i[2] = i1;
+   vdst.i[1] = i2;
+   vdst.i[0] = i3;
+#endif
+
+   return (__m128i) vdst.m128si;
+}
+
+static inline __m128i
+vec_setr_epi32 (int i0, int i1, int i2, int i3)
+{
+  return vec_set_epi32 (i3, i2, i1, i0);
+}
+
+static inline __m128i
+vec_unpacklo_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpacklo_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_add_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_sub_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms (emits vmuluwm) */
+static inline __m128i
+vec_mullo_epi32 (__m128i a, __m128i b)
+{
+   __m128i v; /* written only by the asm statement below */
+
+   __asm__(
+           "vmuluwm %0, %1, %2   \n"
+           : "=v" (v)             /* output operand %0 */
+           : "v" (a), "v" (b)     /* input operands %1 and %2 */
+           );
+
+   return v;
+}
+
+static inline void
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
+{
+   __m128i t0 = vec_unpacklo_epi32(*a, *b);
+   __m128i t1 = vec_unpacklo_epi32(*c, *d);
+   __m128i t2 = vec_unpackhi_epi32(*a, *b);
+   __m128i t3 = vec_unpackhi_epi32(*c, *d);
+
+   *o = vec_unpacklo_epi64(t0, t1);
+   *p = vec_unpackhi_epi64(t0, t1);
+   *q = vec_unpacklo_epi64(t2, t3);
+   *r = vec_unpackhi_epi64(t2, t3);
+}
+
+static inline __m128i
+vec_slli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srai_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      count = 31; /* like _mm_srai_epi32: oversized counts sign-fill */
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_cmpeq_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_loadu_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc; /* lets the loaded words be returned as __m128i */
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+
+   vsrc.m128ui = *((const vector unsigned int *) src);
+
+#else
+
+   __m128i vmask, tmp1, tmp2;
+
+   vmask = vec_lvsl(0, src); /* permute control computed from src */
+
+   tmp1 = (__m128i) vec_ld (0, src);
+   tmp2 = (__m128i) vec_ld (15, src);
+   vsrc.m128ui = (vector unsigned int) vec_perm (tmp1, tmp2, vmask);
+
+#endif
+
+   return vsrc.m128i;
+}
+
+static inline __m128i
+vec_load_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc; /* lets the loaded words be returned as __m128i */
+
+   vsrc.m128ui = *((const vector unsigned int *) src);
+
+   return vsrc.m128i;
+}
+
+static inline void
+vec_store_si128 (uint32_t* dest, __m128i vdata)
+{
+   vec_st ((vector unsigned int) vdata, 0, dest);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms (uses vec_vgbbd) */
+static inline int
+vec_movemask_epi8 (__m128i vsrc)
+{
+   __m128i_union vtemp; /* gives byte-wise access to the vec_vgbbd result */
+   int result;
+
+   vtemp.m128i = vec_vgbbd(vsrc); /* presumably gathers per-byte MSBs - TODO confirm vs. Power ISA */
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   result = vtemp.ub[15] << 8 | vtemp.ub[7]; /* byte 15 -> bits 15:8, byte 7 -> bits 7:0 */
+#else
+   result = vtemp.ub[0] << 8 | vtemp.ub[8]; /* byte 0 -> bits 15:8, byte 8 -> bits 7:0 */
+#endif
+
+   return result;
+}
+
+static inline __m128i
+vec_packs_epi16 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed short) a,
+                               (vector signed short) b);
+#else
+   return (__m128i) vec_packs ((vector signed short) b,
+                               (vector signed short) a);
+#endif
+}
+
+static inline __m128i
+vec_packs_epi32 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
+#else
+   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
+#endif
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+#endif /* U_PWR8_H_ */

From 3bbe16ea79bb5738109df36780cc99119a006d91 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 13 Dec 2015 17:49:32 +0200
Subject: [PATCH 105/241] llvmpipe: Optimize do_triangle_ccw for POWER8

This patch converts the SSE optimization done in do_triangle_ccw to
VMX/VSX.

I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.

                      FPS/Score
  Name            Before     After    Delta
------------------------------------------------
glmark2 (score)   136.6      139.8    2.34%
openarena         16.14      16.35    1.30%
xonotic           4.655      4.707    1.11%

v2:

- Convert loads to use aligned loads
- Make sure code is built only on POWER8 LE machine

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 100 ++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index b1671dd0ae2..0ff10a2027d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -46,6 +46,9 @@
 
 #if defined(PIPE_ARCH_SSE)
 #include <emmintrin.h>
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+#include <altivec.h>
+#include "util/u_pwr8.h"
 #endif
 
 static inline int
@@ -462,6 +465,103 @@ do_triangle_ccw(struct lp_setup_context *setup,
       STORE_PLANE(plane[2], p2);
 #undef STORE_PLANE
    } else
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+   /*
+    * XXX this code is effectively disabled for all practical purposes,
+    * as the allowed fb size is tiny if FIXED_ORDER is 8.
+    */
+   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
+       setup->fb.height <= MAX_FIXED_LENGTH32 &&
+       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
+       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+      unsigned int bottom_edge;
+      __m128i vertx, verty;
+      __m128i shufx, shufy;
+      __m128i dcdx, dcdy, c;
+      __m128i unused;
+      __m128i dcdx_neg_mask;
+      __m128i dcdy_neg_mask;
+      __m128i dcdx_zero_mask;
+      __m128i top_left_flag;
+      __m128i c_inc_mask, c_inc;
+      __m128i eo, p0, p1, p2;
+      __m128i_union vshuf_mask;
+      __m128i zero = vec_splats((unsigned char) 0);
+      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      vshuf_mask.i[0] = 0x07060504;
+      vshuf_mask.i[1] = 0x0B0A0908;
+      vshuf_mask.i[2] = 0x03020100;
+      vshuf_mask.i[3] = 0x0F0E0D0C;
+#else
+      vshuf_mask.i[0] = 0x00010203;
+      vshuf_mask.i[1] = 0x0C0D0E0F;
+      vshuf_mask.i[2] = 0x04050607;
+      vshuf_mask.i[3] = 0x08090A0B;
+#endif
+
+      /* vertex x coords */
+      vertx = vec_load_si128((const uint32_t *) position->x);
+      /* vertex y coords */
+      verty = vec_load_si128((const uint32_t *) position->y);
+
+      shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
+      shufy = vec_perm (verty, verty, vshuf_mask.m128i);
+
+      dcdx = vec_sub_epi32(verty, shufy);
+      dcdy = vec_sub_epi32(vertx, shufx);
+
+      dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
+      dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
+      dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
+
+      bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
+      top_left_flag = (__m128i) vec_splats(bottom_edge);
+
+      c_inc_mask = vec_or(dcdx_neg_mask,
+                                vec_and(dcdx_zero_mask,
+                                              vec_xor(dcdy_neg_mask,
+                                                            top_left_flag)));
+
+      c_inc = vec_srli_epi32(c_inc_mask, 31);
+
+      c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
+                        vec_mullo_epi32(dcdy, verty));
+
+      c = vec_add_epi32(c, c_inc);
+
+      /* Scale up to match c:
+       */
+      dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
+      dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+                         vec_and(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+      /* Pointless transpose which gets undone immediately in
+       * rasterization:
+       */
+      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
+                       &p0, &p1, &p2, &unused);
+
+#define STORE_PLANE(plane, vec) do {                  \
+         vec_store_si128((uint32_t *)&temp_vec, vec); \
+         plane.c    = (int64_t)temp_vec[0];           \
+         plane.dcdx = temp_vec[1];                    \
+         plane.dcdy = temp_vec[2];                    \
+         plane.eo   = temp_vec[3];                    \
+      } while(0)
+
+      STORE_PLANE(plane[0], p0);
+      STORE_PLANE(plane[1], p1);
+      STORE_PLANE(plane[2], p2);
+#undef STORE_PLANE
+   } else
 #endif
    {
       int i;

From 925c46cfc48042ec0bc5a83df962d2d7dd038394 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Tue, 29 Dec 2015 18:12:34 +0200
Subject: [PATCH 106/241] llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8

This patch converts the SSE-optimized build_mask_32() and
build_mask_linear_32() to VMX/VSX.

I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.

                      FPS/Score
  Name            Before     After    Delta
------------------------------------------------
glmark2 (score)   139.8      142.7    2.07%

openarena and xonotic didn't show a significant (more than 1%)
difference.

v2: Make sure code is built only on POWER8 LE machine

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 150 +++++++++++++++------
 1 file changed, 110 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index c9b9221d87c..09a182ac84a 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
    lp_rast_triangle_4(task, arg2);
 }
 
-#if !defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_SSE)
 
-void
-lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<3)-1;
-   lp_rast_triangle_32_3(task, arg2);
-}
-
-void
-lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<4)-1;
-   lp_rast_triangle_32_4(task, arg2);
-}
-
-void
-lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
-                      const union lp_rast_cmd_arg arg)
-{
-   lp_rast_triangle_32_3_16(task, arg);
-}
-
-#else
 #include <emmintrin.h>
 #include "util/u_sse.h"
 
@@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define NR_PLANES 3
 
-
-
-
-
-
-
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
@@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                                0xffff & ~out[i].mask);
 }
 
-
-
-
-
 void
 lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
@@ -471,6 +433,114 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 }
 
 #undef NR_PLANES
+
+#else
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#include <altivec.h>
+#include "util/u_pwr8.h"
+
+static inline void
+build_masks_32(int c,
+               int cdiff,
+               int dcdx,
+               int dcdy,
+               unsigned *outmask,
+               unsigned *partmask)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   {
+      __m128i cstep01, cstep23, result;
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *outmask |= vec_movemask_epi8(result);
+   }
+
+
+   {
+      __m128i cio4 = (__m128i) vec_splats(cdiff);
+      __m128i cstep01, cstep23, result;
+
+      cstep0 = vec_add_epi32(cstep0, cio4);
+      cstep1 = vec_add_epi32(cstep1, cio4);
+      cstep2 = vec_add_epi32(cstep2, cio4);
+      cstep3 = vec_add_epi32(cstep3, cio4);
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *partmask |= vec_movemask_epi8(result);
+   }
+}
+
+static inline unsigned
+build_mask_linear_32(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = vec_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return vec_movemask_epi8(result);
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<3)-1;
+   lp_rast_triangle_32_3(task, arg2);
+}
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;
+   lp_rast_triangle_32_4(task, arg2);
+}
+
+void
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   lp_rast_triangle_32_3_16(task, arg);
+}
+
 #endif
 
 
@@ -512,7 +582,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
 
-#ifdef PIPE_ARCH_SSE
+#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
 #undef BUILD_MASKS
 #undef BUILD_MASK_LINEAR
 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)

From 9d59b9d00cdb1e0e8bd139fba5250df869727386 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Tue, 29 Dec 2015 18:12:35 +0200
Subject: [PATCH 107/241] llvmpipe: Optimize lp_rast_triangle_32_3_16 for
 POWER8

This patch converts the SSE-optimized lp_rast_triangle_32_3_16()
to VMX/VSX.

I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.

                      FPS/Score
 Name            Before     After    Delta
------------------------------------------------
openarena        16.35      16.7     2.14%
xonotic          4.707      4.97     5.57%

glmark2 didn't show a significant (more than 1%) difference.

v2: Make sure code is built only on POWER8 LE machine

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 142 ++++++++++++++++++++-
 1 file changed, 141 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 09a182ac84a..232c8599e42 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -512,7 +512,145 @@ build_mask_linear_32(int c, int dcdx, int dcdy)
    return vec_movemask_epi8(result);
 }
 
-#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+static inline __m128i
+lp_plane_to_m128i(const struct lp_rast_plane *plane)
+{
+   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
+                         (int32_t)plane->dcdy, (int32_t)plane->eo);
+}
+
+#define NR_PLANES 3
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = vec_splats((unsigned char) 0);
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+
+   __m128i vshuf_mask0;
+   __m128i vshuf_mask1;
+   __m128i vshuf_mask2;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
+#else
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
+#endif
+
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = vec_sub_epi32(zero, dcdx);
+
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
+   rej4 = vec_slli_epi32(rej4, 2);
+
+   /*
+    * Adjust so we can just check the sign bit (< 0 comparison),
+    * instead of having to do a less efficient <= 0 comparison
+    */
+   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
+   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));
+
+   dcdx2 = vec_add_epi32(dcdx, dcdx);
+   dcdx3 = vec_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
+
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = vec_add_epi32(cx, rej4);
+         __m128i rej_masks = vec_srai_epi32(c4rej, 31);
+
+         /* if (is_zero(rej_masks)) */
+         if (vec_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
+            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
+            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);
+
+            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
+            __m128i c_01 = vec_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);
+
+            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
+            __m128i c_23 = vec_packs_epi32(c_2, c_3);
+            __m128i c_0123 = vec_packs_epi16(c_01, c_23);
+
+            unsigned mask = vec_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
+      }
+
+      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
+   }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
+}
+
+#undef NR_PLANES
+
+#else
 
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
@@ -524,6 +662,8 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
    lp_rast_triangle_32_3(task, arg2);
 }
 
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
 void
 lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                          const union lp_rast_cmd_arg arg)

From 67d4b4b28c358845f0c0b9f6cacd5e611c746313 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Tue, 5 Jan 2016 21:07:18 +1100
Subject: [PATCH 108/241] gallium: Remove unnecessary semicolons

Fix silly issue with MSVC case fall-through support, which needs
an extra 'break;'

Found-by: Coccinelle
Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/draw/draw_pipe_aaline.c  | 2 +-
 src/gallium/auxiliary/gallivm/lp_bld_swizzle.c | 2 +-
 src/gallium/auxiliary/nir/tgsi_to_nir.c        | 2 +-
 src/gallium/auxiliary/util/u_surface.c         | 3 ++-
 src/gallium/auxiliary/vl/vl_mpeg12_decoder.c   | 2 +-
 src/gallium/state_trackers/nine/swapchain9.c   | 2 +-
 src/gallium/state_trackers/omx/entrypoint.c    | 2 +-
 src/gallium/state_trackers/vdpau/mixer.c       | 2 +-
 8 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 3ce550a3ae8..e85ae16c1df 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -938,7 +938,7 @@ draw_aaline_prepare_outputs(struct draw_context *draw,
    const struct pipe_rasterizer_state *rast = draw->rasterizer;
 
    /* update vertex attrib info */
-   aaline->pos_slot = draw_current_shader_position_output(draw);;
+   aaline->pos_slot = draw_current_shader_position_output(draw);
 
    if (!rast->line_smooth)
       return;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index b1aef715e20..f5718389f33 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -720,7 +720,7 @@ lp_build_transpose_aos_n(struct gallivm_state *gallivm,
 
       default:
          assert(0);
-   };
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 94d992b0031..7c577592f70 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1950,7 +1950,7 @@ tgsi_processor_to_shader_stage(unsigned processor)
    case TGSI_PROCESSOR_COMPUTE:   return MESA_SHADER_COMPUTE;
    default:
       unreachable("invalid TGSI processor");
-   };
+   }
 }
 
 struct nir_shader *
diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c
index 6aa44f9602a..c150d92b967 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -600,7 +600,8 @@ is_box_inside_resource(const struct pipe_resource *res,
       depth = res->array_size;
       assert(res->array_size % 6 == 0);
       break;
-   case PIPE_MAX_TEXTURE_TYPES:;
+   case PIPE_MAX_TEXTURE_TYPES:
+      break;
    }
 
    return box->x >= 0 &&
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index f5bb3a0106f..b5c70451ce8 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -792,7 +792,7 @@ vl_mpeg12_end_frame(struct pipe_video_codec *decoder,
       for (j = 0; j < VL_MAX_REF_FRAMES; ++j) {
          if (!ref_frames[j] || !ref_frames[j][i]) continue;
 
-         vb[2] = vl_vb_get_mv(&buf->vertex_stream, j);;
+         vb[2] = vl_vb_get_mv(&buf->vertex_stream, j);
          dec->context->set_vertex_buffers(dec->context, 0, 3, vb);
 
          vl_mc_render_ref(i ? &dec->mc_c : &dec->mc_y, &buf->mc[i], ref_frames[j][i]);
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index 3f5be26fed7..3b1a7a4493c 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -790,7 +790,7 @@ NineSwapChain9_Present( struct NineSwapChain9 *This,
         case D3DSWAPEFFECT_FLIP:
             UNTESTED(4);
         case D3DSWAPEFFECT_DISCARD:
-            /* rotate the queue */;
+            /* rotate the queue */
             pipe_resource_reference(&res, This->buffers[0]->base.resource);
             for (i = 1; i <= This->params.BackBufferCount; i++) {
                 NineSurface9_SetResourceResize(This->buffers[i - 1],
diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c
index da9ca104d93..afcbd974e76 100644
--- a/src/gallium/state_trackers/omx/entrypoint.c
+++ b/src/gallium/state_trackers/omx/entrypoint.c
@@ -137,7 +137,7 @@ OMX_ERRORTYPE omx_workaround_Destructor(OMX_COMPONENTTYPE *comp)
    priv->state = OMX_StateInvalid;
    tsem_up(priv->messageSem);
 
-   /* wait for thread to exit */;
+   /* wait for thread to exit */
    pthread_join(priv->messageHandlerThread, NULL);
 
    return omx_base_component_Destructor(comp);
diff --git a/src/gallium/state_trackers/vdpau/mixer.c b/src/gallium/state_trackers/vdpau/mixer.c
index c0b1ecc55fa..dec79ff95e2 100644
--- a/src/gallium/state_trackers/vdpau/mixer.c
+++ b/src/gallium/state_trackers/vdpau/mixer.c
@@ -294,7 +294,7 @@ VdpStatus vlVdpVideoMixerRender(VdpVideoMixer mixer,
    default:
       pipe_mutex_unlock(vmixer->device->mutex);
       return VDP_STATUS_INVALID_VIDEO_MIXER_PICTURE_STRUCTURE;
-   };
+   }
 
    if (deinterlace != VL_COMPOSITOR_WEAVE && vmixer->deint.enabled &&
        video_surface_past_count > 1 && video_surface_future_count > 0) {

From bfabd5e74a8898a470c91924cbcf95e6876fbe95 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Tue, 5 Jan 2016 21:07:19 +1100
Subject: [PATCH 109/241] gallium/drivers: Remove unnecessary semicolons

Found-by: Coccinelle
Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/i915/i915_state.c             | 2 +-
 src/gallium/drivers/ilo/shader/ilo_shader_vs.c    | 2 +-
 src/gallium/drivers/llvmpipe/lp_test_blend.c      | 2 +-
 src/gallium/drivers/llvmpipe/lp_test_conv.c       | 2 +-
 src/gallium/drivers/r300/compiler/r500_fragprog.c | 2 +-
 src/gallium/drivers/r600/r600_shader.c            | 2 +-
 src/gallium/drivers/radeonsi/cik_sdma.c           | 2 +-
 src/gallium/drivers/softpipe/sp_query.c           | 2 +-
 src/gallium/drivers/svga/svga_cmd.c               | 4 ++--
 src/gallium/drivers/vc4/vc4_program.c             | 2 +-
 10 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 6ba9646f7ab..d1661fed3f7 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -423,7 +423,7 @@ i915_prepare_vertex_sampling(struct i915_context *i915)
          for (j = view->u.tex.first_level; j <= tex->last_level; j++) {
             mip_offsets[j] = i915_texture_offset(i915_tex, j , 0 /* FIXME depth */);
             row_stride[j] = i915_tex->stride;
-            img_stride[j] = 0; /* FIXME */;
+            img_stride[j] = 0; /* FIXME */
          }
 
          draw_set_mapped_texture(i915->draw,
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
index a29baab10c1..46a7e6f69d0 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
@@ -126,7 +126,7 @@ vs_lower_opcode_tgsi_const_gen6(struct vs_compile_context *vcc,
    tc_MOV(tc, block_offsets, idx);
 
    msg_type = GEN6_MSG_DP_OWORD_DUAL_BLOCK_READ;
-   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;;
+   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;
    msg_len = 2;
 
    desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false,
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 7b19174f345..9139b83f05a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -184,7 +184,7 @@ add_blend_test(struct gallivm_state *gallivm,
 
    LLVMBuildStore(builder, res, res_ptr);
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index a30f35c8149..02a63193af5 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -140,7 +140,7 @@ add_conv_test(struct gallivm_state *gallivm,
       LLVMBuildStore(builder, dst[i], ptr);
    }
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
diff --git a/src/gallium/drivers/r300/compiler/r500_fragprog.c b/src/gallium/drivers/r300/compiler/r500_fragprog.c
index 88aad8a054f..4c415afcb05 100644
--- a/src/gallium/drivers/r300/compiler/r500_fragprog.c
+++ b/src/gallium/drivers/r300/compiler/r500_fragprog.c
@@ -384,7 +384,7 @@ void r500FragmentProgramDump(struct radeon_compiler *c, void *user)
     case R500_INST_TYPE_OUT: str = "OUT"; break;
     case R500_INST_TYPE_FC: str = "FC"; break;
     case R500_INST_TYPE_TEX: str = "TEX"; break;
-    };
+    }
     fprintf(stderr,"%s %s %s %s %s ", str,
 	    inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
 	    inst & R500_INST_LAST ? "LAST" : "",
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index c1565498ea5..07d06aa24df 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -4427,7 +4427,7 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ctx->inst_info->op;
 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
+				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
 			}
 			alu.dst.sel = t1;
 			alu.dst.chan = i;
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 2de237b4716..105a1b2a878 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -196,7 +196,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 			(tile_split << 11) | (mt << 8) | (array_mode << 3) |
 			lbpe;
 		cs->buf[cs->cdw++] = y << 16; /* | x */
-		cs->buf[cs->cdw++] = 0; /* z */;
+		cs->buf[cs->cdw++] = 0; /* z */
 		cs->buf[cs->cdw++] = addr & 0xfffffffc;
 		cs->buf[cs->cdw++] = addr >> 32;
 		cs->buf[cs->cdw++] = (pitch / bpe) - 1;
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 76105b4c0ec..c28d28d5f5d 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -223,7 +223,7 @@ softpipe_get_query_result(struct pipe_context *pipe,
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       memcpy(vresult, &sq->stats,
-             sizeof(struct pipe_query_data_pipeline_statistics));;
+             sizeof(struct pipe_query_data_pipeline_statistics));
       break;
    case PIPE_QUERY_GPU_FINISHED:
       vresult->b = TRUE;
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 10442cb46e7..f35b1371ebe 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -337,7 +337,7 @@ SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,    // IN
    mipSizes[0].height = height;
    mipSizes[0].depth = 1;
 
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
@@ -372,7 +372,7 @@ SVGA3D_DestroySurface(struct svga_winsys_context *swc,
    
    swc->surface_relocation(swc, &cmd->sid, NULL, sid,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index da0d21111a0..44e89fe64e9 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -89,7 +89,7 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
                 range->dst_offset = c->next_ubo_dst_offset;
                 c->next_ubo_dst_offset += range->size;
                 c->num_ubo_ranges++;
-        };
+        }
 
         offset -= range->src_offset;
 

From 5071c192ccbfac92c53d93aea049bf981ae9e442 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Tue, 5 Jan 2016 21:07:20 +1100
Subject: [PATCH 110/241] gallium: Use unsigned for loop index

Found-by: Coccinelle
Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 09c1b379172..8c39ab0afe9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1197,7 +1197,7 @@ get_soa_array_offsets(struct lp_build_context *uint_bld,
 
    if (need_perelement_offset) {
       LLVMValueRef pixel_offsets;
-      int i;
+      unsigned i;
      /* build pixel offset vector: {0, 1, 2, 3, ...} */
       pixel_offsets = uint_bld->undef;
       for (i = 0; i < uint_bld->type.length; i++) {
@@ -1809,7 +1809,7 @@ emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *float_bld = &bld_base->base;
-   int i;
+   unsigned i;
    LLVMValueRef temp, temp2;
    LLVMValueRef shuffles[8];
    LLVMValueRef shuffles2[8];
@@ -2713,7 +2713,7 @@ static boolean
 near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
                    int pc)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < 5; i++) {
       unsigned opcode;

From 76a7d6f412f976b7bee57ff7e691fefd90e912ee Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Tue, 5 Jan 2016 21:07:21 +1100
Subject: [PATCH 111/241] gallium/drivers/ilo: Use unsigned for loop index

Found-by: Coccinelle
Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/ilo/core/ilo_builder.c       |  8 ++++----
 src/gallium/drivers/ilo/shader/ilo_shader_fs.c   | 16 ++++++++--------
 src/gallium/drivers/ilo/shader/ilo_shader_vs.c   |  4 ++--
 src/gallium/drivers/ilo/shader/toy_legalize_ra.c |  4 ++--
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c
index 9d5195129b7..079872f4306 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder.c
@@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder,
                  const struct ilo_dev *dev,
                  struct intel_winsys *winsys)
 {
-   int i;
+   unsigned i;
 
    assert(ilo_is_zeroed(builder, sizeof(*builder)));
 
@@ -366,7 +366,7 @@ ilo_builder_init(struct ilo_builder *builder,
 void
 ilo_builder_reset(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++)
       ilo_builder_writer_reset(builder, i);
@@ -382,7 +382,7 @@ ilo_builder_reset(struct ilo_builder *builder)
 bool
 ilo_builder_begin(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++) {
       if (!ilo_builder_writer_alloc_and_map(builder, i)) {
@@ -407,7 +407,7 @@ struct intel_bo *
 ilo_builder_end(struct ilo_builder *builder, unsigned *used)
 {
    struct ilo_builder_writer *bat;
-   int i;
+   unsigned i;
 
    ilo_builder_batch_patch_sba(builder);
 
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
index 5250115a893..f46126e8427 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
@@ -266,7 +266,7 @@ fs_lower_opcode_tgsi_indirect_const(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc, real_src[4];
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    tsrc_transpose(idx, real_src);
 
@@ -319,7 +319,7 @@ fs_lower_opcode_tgsi_const_pcb(struct fs_compile_context *fcc,
    const int grf_subreg = (idx.val32 & 1) * 16;
    struct toy_src src;
    struct toy_dst real_dst[4];
-   int i;
+   unsigned i;
 
    if (!fcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM ||
        grf >= fcc->first_attr_grf)
@@ -350,7 +350,7 @@ fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -396,7 +396,7 @@ fs_lower_opcode_tgsi_const_gen7(struct fs_compile_context *fcc,
    struct toy_src desc;
    struct toy_inst *inst;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -1168,7 +1168,7 @@ fs_lower_opcode_derivative(struct toy_compiler *tc, struct toy_inst *inst)
 {
    struct toy_dst dst[4];
    struct toy_src src[4];
-   int i;
+   unsigned i;
 
    tdst_transpose(inst->dst, dst);
    tsrc_transpose(inst->src[0], src);
@@ -1257,7 +1257,7 @@ fs_lower_opcode_kil(struct toy_compiler *tc, struct toy_inst *inst)
    }
    else {
       struct toy_src src[4];
-      int i;
+      unsigned i;
 
       tsrc_transpose(inst->src[0], src);
       /* mask out killed pixels */
@@ -1583,7 +1583,7 @@ fs_write_fb(struct fs_compile_context *fcc)
 static void
 fs_setup_shader_out(struct ilo_shader *sh, const struct toy_tgsi *tgsi)
 {
-   int i;
+   unsigned i;
 
    sh->out.count = tgsi->num_outputs;
    for (i = 0; i < tgsi->num_outputs; i++) {
@@ -1603,7 +1603,7 @@ static void
 fs_setup_shader_in(struct ilo_shader *sh, const struct toy_tgsi *tgsi,
                    bool flatshade)
 {
-   int i;
+   unsigned i;
 
    sh->in.count = tgsi->num_inputs;
    for (i = 0; i < tgsi->num_inputs; i++) {
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
index 46a7e6f69d0..0df0afc706b 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
@@ -522,7 +522,7 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
       if (num_coords >= 3) {
          struct toy_dst tmp, max;
          struct toy_src abs_coords[3];
-         int i;
+         unsigned i;
 
          tmp = tc_alloc_tmp(tc);
          max = tdst_writemask(tmp, TOY_WRITEMASK_W);
@@ -804,7 +804,7 @@ static int
 vs_collect_outputs(struct vs_compile_context *vcc, struct toy_src *outs)
 {
    const struct toy_tgsi *tgsi = &vcc->tgsi;
-   int i;
+   unsigned i;
 
    for (i = 0; i < vcc->shader->out.count; i++) {
       const int slot = vcc->output_map[i];
diff --git a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
index b725375fb67..1874faa6be3 100644
--- a/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
+++ b/src/gallium/drivers/ilo/shader/toy_legalize_ra.c
@@ -70,7 +70,7 @@ struct linear_scan {
 static void
 linear_scan_free_regs(struct linear_scan *ls, int reg, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++)
       ls->free_regs[ls->num_free_regs++] = reg + count - 1 - i;
@@ -221,7 +221,7 @@ linear_scan_spill(struct linear_scan *ls,
 static void
 linear_scan_spill_range(struct linear_scan *ls, int first, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++) {
       struct linear_scan_live_interval *interval = &ls->intervals[first + i];

From 8e2a8ec731975aaeaf2188274c3b0c49eed36593 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Tue, 5 Jan 2016 21:07:22 +1100
Subject: [PATCH 112/241] gallium/drivers/r600: Use unsigned for loop index

Found-by: Coccinelle
Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/r600/r600_shader.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 07d06aa24df..df40f94bdcf 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -395,7 +395,7 @@ static int tgsi_last_instruction(unsigned writemask)
 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
-	int j;
+	unsigned j;
 
 	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
 		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
@@ -1167,7 +1167,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 */
 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 	int num_baryc;
 	struct tgsi_parse_context parse;
 
@@ -1586,7 +1586,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	int i;
+	unsigned i;
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		struct tgsi_full_src_register *src = &inst->Src[i];
@@ -1855,7 +1855,7 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	int i;
+	unsigned i;
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		struct tgsi_full_src_register *src = &inst->Src[i];
@@ -2785,7 +2785,7 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
 
 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 	int stride, outer_comps, inner_comps;
 	int tessinner_idx = -1, tessouter_idx = -1;
 	int r;
@@ -4794,7 +4794,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 	{
 		int chan;
 		int sel;
-		int i;
+		unsigned i;
 
 		if (ctx->bc->chip_class == CAYMAN) {
 			for (i = 0; i < 3; i++) {
@@ -7928,7 +7928,7 @@ static int tgsi_exp(struct r600_shader_ctx *ctx)
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bytecode_alu alu;
 	int r;
-	int i;
+	unsigned i;
 
 	/* result.x = 2^floor(src); */
 	if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8057,7 +8057,7 @@ static int tgsi_log(struct r600_shader_ctx *ctx)
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bytecode_alu alu;
 	int r;
-	int i;
+	unsigned i;
 
 	/* result.x = floor(log2(|src|)); */
 	if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8784,7 +8784,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
 
 static int tgsi_endloop(struct r600_shader_ctx *ctx)
 {
-	int i;
+	unsigned i;
 
 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
 

From 1953cee6d7a9d0c948a05ffc7bbafff378cb1751 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Tue, 5 Jan 2016 21:07:23 +1100
Subject: [PATCH 113/241] gallium/drivers/svga: Use unsigned for loop index

Fix a 's/unsigned int/unsigned/' consistency case while here.

Found-by: Coccinelle
Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 .../drivers/svga/svga_resource_buffer_upload.c       |  2 +-
 src/gallium/drivers/svga/svga_tgsi_insn.c            | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 8c5cff5abc1..ba1a1f222b6 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -221,7 +221,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    struct svga_3d_update_gb_image *whole_update_cmd = NULL;
    uint32 numBoxes = sbuf->map.num_ranges;
    struct pipe_resource *dummy;
-   unsigned int i;
+   unsigned i;
 
    assert(numBoxes);
    assert(sbuf->dma.updates == NULL);
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index dbb90f7654e..970e70aabf9 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -166,7 +166,7 @@ scalar(struct src_register src, unsigned comp)
 static boolean
 svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -179,7 +179,7 @@ svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 static int
 svga_arl_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -1175,7 +1175,7 @@ emit_div(struct svga_shader_emitter *emit,
    const struct src_register src1 =
       translate_src_register(emit, &insn->Src[1] );
    SVGA3dShaderDestToken temp = get_temp( emit );
-   int i;
+   unsigned i;
 
    /* For each enabled element, perform a RCP instruction.  Note that
     * RCP is scalar in SVGA3D:
@@ -1822,7 +1822,7 @@ emit_tex_swizzle(struct svga_shader_emitter *emit,
    const unsigned swizzleIn[4] = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
    unsigned srcSwizzle[4];
    unsigned srcWritemask = 0x0, zeroWritemask = 0x0, oneWritemask = 0x0;
-   int i;
+   unsigned i;
 
    /* build writemasks and srcSwizzle terms */
    for (i = 0; i < 4; i++) {
@@ -3371,7 +3371,7 @@ emit_light_twoside(struct svga_shader_emitter *emit)
    struct src_register back[2];
    SVGA3dShaderDestToken color[2];
    int count = emit->internal_color_count;
-   int i;
+   unsigned i;
    SVGA3dShaderInstToken if_token;
 
    if (count == 0)
@@ -3698,7 +3698,7 @@ static boolean
 pre_parse_add_indirect( struct svga_shader_emitter *emit,
                         int num, int current_arl)
 {
-   int i;
+   unsigned i;
    assert(num < 0);
 
    for (i = 0; i < emit->num_arl_consts; ++i) {

From 0a89f307f95de3a3357d834f36c60fe803895f8a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 4 Jan 2016 13:56:39 -0800
Subject: [PATCH 114/241] vc4: Don't try the SF coalescing unless it's on a
 def.

If you want the SF of the value of a register produced from a series of
packing MOVs or conditional MOVs, we can't just SF on the last MOV into
the register.
---
 src/gallium/drivers/vc4/vc4_qir.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index c6916c48e7e..a46fb4fd3b8 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -503,9 +503,9 @@ qir_SF(struct vc4_compile *c, struct qreg src)
         if (!list_empty(&c->instructions))
                 last_inst = (struct qinst *)c->instructions.prev;
 
-        if (!last_inst ||
-            last_inst->dst.file != src.file ||
-            last_inst->dst.index != src.index ||
+        if (src.file != QFILE_TEMP ||
+            !c->defs[src.index] ||
+            last_inst != c->defs[src.index] ||
             qir_is_multi_instruction(last_inst)) {
                 src = qir_MOV(c, src);
                 last_inst = (struct qinst *)c->instructions.prev;

From 71db7d3dc577e48da3689fd66989ec3b0a069089 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 22 Dec 2015 13:37:36 -0800
Subject: [PATCH 115/241] vc4: Replace the SSA-style SEL operators with
 conditional MOVs.

I'm moving away from QIR being SSA (since NIR is doing lots of SSA
optimization for us now) and instead having QIR just be QPU operations
with virtual registers.  By making our SELs be composed of two MOVs, we
could potentially coalesce the registers for the MOV's src and dst and
eliminate the MOV.

total instructions in shared programs: 88448 -> 88028 (-0.47%)
instructions in affected programs:     39845 -> 39425 (-1.05%)
total estimated cycles in shared programs: 246306 -> 245762 (-0.22%)
estimated cycles in affected programs:     162887 -> 162343 (-0.33%)
---
 src/gallium/drivers/vc4/vc4_opt_algebraic.c |  37 ------
 src/gallium/drivers/vc4/vc4_program.c       | 140 +++++++++++---------
 src/gallium/drivers/vc4/vc4_qir.c           |  46 +++----
 src/gallium/drivers/vc4/vc4_qir.h           |  50 +++----
 src/gallium/drivers/vc4/vc4_qir_schedule.c  |   7 +-
 src/gallium/drivers/vc4/vc4_qpu_emit.c      |  49 ++-----
 6 files changed, 128 insertions(+), 201 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index aea2b9dbe87..b8ce377ff6b 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -145,43 +145,6 @@ qir_opt_algebraic(struct vc4_compile *c)
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 switch (inst->op) {
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        if (is_zero(c, inst->src[1])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= (QOP_SEL_X_Y_ZS - QOP_SEL_X_0_ZS);
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        if (is_zero(c, inst->src[0])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent, flipping the
-                                 * condition being evaluated since the operand
-                                 * order is flipped.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= QOP_SEL_X_Y_ZS;
-                                inst->op ^= 1;
-                                inst->op += QOP_SEL_X_0_ZS;
-                                inst->src[0] = inst->src[1];
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        break;
-
                 case QOP_FMIN:
                         if (is_1f(c, inst->src[1]) &&
                             inst->src[0].pack >= QPU_UNPACK_8D_REP &&
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 44e89fe64e9..c24aa19e74e 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -275,7 +275,7 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
                                    qir_uniform_f(c, 2.4));
 
         qir_SF(c, qir_FSUB(c, srgb, qir_uniform_f(c, 0.04045)));
-        return qir_SEL_X_Y_NS(c, low, high);
+        return qir_SEL(c, QPU_COND_NS, low, high);
 }
 
 static struct qreg
@@ -475,7 +475,8 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                 struct qreg normalized = ntq_scale_depth_texture(c, tex);
                 struct qreg depth_output;
 
-                struct qreg one = qir_uniform_f(c, 1.0f);
+                struct qreg u0 = qir_uniform_f(c, 0.0f);
+                struct qreg u1 = qir_uniform_f(c, 1.0f);
                 if (c->key->tex[unit].compare_mode) {
                         if (has_proj)
                                 compare = qir_FMUL(c, compare, proj);
@@ -485,31 +486,31 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                                 depth_output = qir_uniform_f(c, 0.0f);
                                 break;
                         case PIPE_FUNC_ALWAYS:
-                                depth_output = one;
+                                depth_output = u1;
                                 break;
                         case PIPE_FUNC_EQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
                                 break;
                         case PIPE_FUNC_NOTEQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
                                 break;
                         case PIPE_FUNC_GREATER:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         case PIPE_FUNC_GEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LESS:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         }
                 } else {
@@ -553,9 +554,8 @@ ntq_ffract(struct vc4_compile *c, struct qreg src)
         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
         struct qreg diff = qir_FSUB(c, src, trunc);
         qir_SF(c, diff);
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, diff, qir_uniform_f(c, 1.0)),
-                              diff);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff);
 }
 
 /**
@@ -572,9 +572,8 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, src, trunc));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 /**
@@ -591,9 +590,8 @@ ntq_fceil(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, trunc, src));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 static struct qreg
@@ -668,10 +666,13 @@ ntq_fcos(struct vc4_compile *c, struct qreg src)
 static struct qreg
 ntq_fsign(struct vc4_compile *c, struct qreg src)
 {
+        struct qreg t = qir_get_temp(c);
+
         qir_SF(c, src);
-        return qir_SEL_X_Y_NC(c,
-                              qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0)),
-                              qir_uniform_f(c, -1.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
+        qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
+        return t;
 }
 
 static void
@@ -888,6 +889,56 @@ ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
         return qir_UNPACK_8_I(c, base, offset_bit / 8);
 }
 
+static struct qreg
+ntq_emit_comparison(struct vc4_compile *c, nir_alu_instr *instr,
+                    struct qreg src0, struct qreg src1)
+{
+        enum qpu_cond cond;
+
+        switch (instr->op) {
+        case nir_op_feq:
+        case nir_op_ieq:
+        case nir_op_seq:
+                cond = QPU_COND_ZS;
+                break;
+        case nir_op_fne:
+        case nir_op_ine:
+        case nir_op_sne:
+                cond = QPU_COND_ZC;
+                break;
+        case nir_op_fge:
+        case nir_op_ige:
+        case nir_op_uge:
+        case nir_op_sge:
+                cond = QPU_COND_NC;
+                break;
+        case nir_op_flt:
+        case nir_op_ilt:
+        case nir_op_slt:
+                cond = QPU_COND_NS;
+                break;
+        default:
+                unreachable("bad ALU op for comparison");
+        }
+
+        if (nir_op_infos[instr->op].input_types[0] == nir_type_float)
+                qir_SF(c, qir_FSUB(c, src0, src1));
+        else
+                qir_SF(c, qir_SUB(c, src0, src1));
+
+        switch (instr->op) {
+        case nir_op_seq:
+        case nir_op_sne:
+        case nir_op_sge:
+        case nir_op_slt:
+                return qir_SEL(c, cond,
+                               qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
+        default:
+                return qir_SEL(c, cond,
+                               qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0.0));
+        }
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -974,7 +1025,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_i2b:
         case nir_op_f2b:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
+                *dest = qir_SEL(c, QPU_COND_ZC,
+                                qir_uniform_ui(c, ~0),
+                                qir_uniform_ui(c, 0));
                 break;
 
         case nir_op_iadd:
@@ -1016,65 +1069,28 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_seq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_slt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_feq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_flt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ieq:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ine:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ige:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_uge:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_CC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ilt:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
+                *dest = ntq_emit_comparison(c, instr, src[0], src[1]);
                 break;
 
         case nir_op_bcsel:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_NS(c, src[1], src[2]);
+                *dest = qir_SEL(c, QPU_COND_NS, src[1], src[2]);
                 break;
         case nir_op_fcsel:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_ZC(c, src[1], src[2]);
+                *dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
                 break;
 
         case nir_op_frcp:
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index a46fb4fd3b8..efbb69b71a7 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -65,19 +65,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_XOR] = { "xor", 1, 2 },
         [QOP_NOT] = { "not", 1, 1 },
 
-        [QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1, false, true },
-        [QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1, false, true },
-        [QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1, false, true },
-        [QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1, false, true },
-        [QOP_SEL_X_0_CS] = { "fsel_x_0_cs", 1, 1, false, true },
-        [QOP_SEL_X_0_CC] = { "fsel_x_0_cc", 1, 1, false, true },
-        [QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2, false, true },
-        [QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2, false, true },
-        [QOP_SEL_X_Y_CS] = { "fsel_x_y_cs", 1, 2, false, true },
-        [QOP_SEL_X_Y_CC] = { "fsel_x_y_cc", 1, 2, false, true },
-
         [QOP_RCP] = { "rcp", 1, 1, false, true },
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
@@ -219,23 +206,8 @@ qir_is_tex(struct qinst *inst)
 bool
 qir_depends_on_flags(struct qinst *inst)
 {
-        switch (inst->op) {
-        case QOP_SEL_X_0_NS:
-        case QOP_SEL_X_0_NC:
-        case QOP_SEL_X_0_ZS:
-        case QOP_SEL_X_0_ZC:
-        case QOP_SEL_X_0_CS:
-        case QOP_SEL_X_0_CC:
-        case QOP_SEL_X_Y_NS:
-        case QOP_SEL_X_Y_NC:
-        case QOP_SEL_X_Y_ZS:
-        case QOP_SEL_X_Y_ZC:
-        case QOP_SEL_X_Y_CS:
-        case QOP_SEL_X_Y_CC:
-                return true;
-        default:
-                return false;
-        }
+        return (inst->cond != QPU_COND_ALWAYS &&
+                inst->cond != QPU_COND_NEVER);
 }
 
 bool
@@ -292,8 +264,19 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 void
 qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
 {
-        fprintf(stderr, "%s%s ",
+        static const char *conditions[] = {
+                [QPU_COND_ALWAYS] = "",
+                [QPU_COND_NEVER] = ".never",
+                [QPU_COND_ZS] = ".zs",
+                [QPU_COND_ZC] = ".zc",
+                [QPU_COND_NS] = ".ns",
+                [QPU_COND_NC] = ".nc",
+                [QPU_COND_CS] = ".cs",
+                [QPU_COND_CC] = ".cc",
+        };
+        fprintf(stderr, "%s%s%s ",
                 qir_get_op_name(inst->op),
+                conditions[inst->cond],
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
@@ -352,6 +335,7 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1)
         inst->src = calloc(2, sizeof(inst->src[0]));
         inst->src[0] = src0;
         inst->src[1] = src1;
+        inst->cond = QPU_COND_ALWAYS;
 
         return inst;
 }
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index b0fbb4c1db2..9dad80dddff 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -93,23 +93,6 @@ enum qop {
         QOP_XOR,
         QOP_NOT,
 
-        /* Note: Orderings of these compares must be the same as in
-         * qpu_defines.h.  Selects the src[0] if the ns flag bit is set,
-         * otherwise 0. */
-        QOP_SEL_X_0_ZS,
-        QOP_SEL_X_0_ZC,
-        QOP_SEL_X_0_NS,
-        QOP_SEL_X_0_NC,
-        QOP_SEL_X_0_CS,
-        QOP_SEL_X_0_CC,
-        /* Selects the src[0] if the ns flag bit is set, otherwise src[1]. */
-        QOP_SEL_X_Y_ZS,
-        QOP_SEL_X_Y_ZC,
-        QOP_SEL_X_Y_NS,
-        QOP_SEL_X_Y_NC,
-        QOP_SEL_X_Y_CS,
-        QOP_SEL_X_Y_CC,
-
         QOP_FTOI,
         QOP_ITOF,
         QOP_RCP,
@@ -170,6 +153,7 @@ struct qinst {
         struct qreg dst;
         struct qreg *src;
         bool sf;
+        uint8_t cond;
 };
 
 enum qstage {
@@ -463,9 +447,11 @@ void qir_schedule_instructions(struct vc4_compile *c);
 void qir_reorder_uniforms(struct vc4_compile *c);
 
 void qir_emit(struct vc4_compile *c, struct qinst *inst);
-static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
+static inline struct qinst *
+qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
 {
         list_addtail(&inst->link, &c->instructions);
+        return inst;
 }
 
 struct qreg qir_get_temp(struct vc4_compile *c);
@@ -536,11 +522,12 @@ qir_##name(struct vc4_compile *c, struct qreg a)                         \
         qir_emit(c, qir_inst(QOP_##name, t, a, c->undef));               \
         return t;                                                        \
 }                                                                        \
-static inline void                                                       \
+static inline struct qinst *                                             \
 qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
                   struct qreg a)                                         \
 {                                                                        \
-        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
+        return qir_emit_nodef(c, qir_inst(QOP_##name, dest, a,           \
+                                          c->undef));                    \
 }
 
 #define QIR_ALU2(name)                                                   \
@@ -592,18 +579,6 @@ QIR_ALU2(V8MAX)
 QIR_ALU2(V8ADDS)
 QIR_ALU2(V8SUBS)
 QIR_ALU2(MUL24)
-QIR_ALU1(SEL_X_0_ZS)
-QIR_ALU1(SEL_X_0_ZC)
-QIR_ALU1(SEL_X_0_NS)
-QIR_ALU1(SEL_X_0_NC)
-QIR_ALU1(SEL_X_0_CS)
-QIR_ALU1(SEL_X_0_CC)
-QIR_ALU2(SEL_X_Y_ZS)
-QIR_ALU2(SEL_X_Y_ZC)
-QIR_ALU2(SEL_X_Y_NS)
-QIR_ALU2(SEL_X_Y_NC)
-QIR_ALU2(SEL_X_Y_CS)
-QIR_ALU2(SEL_X_Y_CC)
 QIR_ALU2(FMIN)
 QIR_ALU2(FMAX)
 QIR_ALU2(FMINABS)
@@ -647,6 +622,17 @@ QIR_NODST_1(TLB_DISCARD_SETUP)
 QIR_NODST_1(TLB_STENCIL_SETUP)
 QIR_NODST_1(MS_MASK)
 
+static inline struct qreg
+qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1)
+{
+        struct qreg t = qir_get_temp(c);
+        struct qinst *a = qir_MOV_dest(c, t, src0);
+        struct qinst *b = qir_MOV_dest(c, t, src1);
+        a->cond = cond;
+        b->cond = cond ^ 1;
+        return t;
+}
+
 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
index d20815f055e..2f280c54523 100644
--- a/src/gallium/drivers/vc4/vc4_qir_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -250,12 +250,11 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
         else if (inst->dst.file == QFILE_TEMP)
                 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
 
+        if (qir_depends_on_flags(inst))
+                add_dep(dir, state->last_sf, n);
+
         if (inst->sf)
                 add_write_dep(dir, &state->last_sf, n);
-
-        if (qir_depends_on_flags(inst)) {
-                add_dep(dir, state->last_sf, n);
-        }
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index cb4e0cfcc3f..b06702afea2 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -64,6 +64,12 @@ set_last_cond_add(struct vc4_compile *c, uint32_t cond)
         *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
 }
 
+static void
+set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
+{
+        *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond);
+}
+
 /**
  * Some special registers can be read from either file, which lets us resolve
  * raddr conflicts without extra MOVs.
@@ -306,42 +312,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
                 }
 
+                bool handled_qinst_cond = true;
+
                 switch (qinst->op) {
-                case QOP_SEL_X_0_ZS:
-                case QOP_SEL_X_0_ZC:
-                case QOP_SEL_X_0_NS:
-                case QOP_SEL_X_0_NC:
-                case QOP_SEL_X_0_CS:
-                case QOP_SEL_X_0_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
-                                              1) + QPU_COND_ZS);
-                        break;
-
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        if (qinst->src[0].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_MOV(dst, src[1]));
-                        if (qinst->src[1].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
-                                              1) + QPU_COND_ZS);
-
-                        break;
-
                 case QOP_RCP:
                 case QOP_RSQ:
                 case QOP_EXP2:
@@ -497,16 +470,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_mul(c, qinst->cond);
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_add(c, qinst->cond);
                         }
+                        handled_qinst_cond = true;
                         set_last_dst_pack(c, qinst);
 
                         break;
                 }
 
+                assert(qinst->cond == QPU_COND_ALWAYS ||
+                       handled_qinst_cond);
+
                 if (qinst->sf) {
                         assert(!qir_is_multi_instruction(qinst));
                         *last_inst(c) |= QPU_SF;

From 12519a972f53dba13289b0abebd558fd8506a539 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 18 Dec 2015 19:15:03 -0800
Subject: [PATCH 116/241] vc4: Use NIR texture lowering for texture swizzling.

We can't use its other features currently (mostly because we don't want
Newton-Raphson on rcps for texture coordinates), but it gets us started.

This eliminates some comparisons with constants in GLB2.7 and ETQW traces
at the QIR level by moving the comparisons into NIR, where they get
constant-folded out.

instructions in affected programs:     165 -> 156 (-5.45%)
total uniforms in shared programs: 32087 -> 32085 (-0.01%)
total estimated cycles in shared programs: 245762 -> 245752 (-0.00%)
estimated cycles in affected programs:     461 -> 451 (-2.17%)
---
 src/gallium/drivers/vc4/vc4_program.c | 115 +++++++++++++-------------
 src/gallium/drivers/vc4/vc4_qir.h     |   5 ++
 2 files changed, 63 insertions(+), 57 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index c24aa19e74e..9d686f72877 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -123,26 +123,6 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
         return &intr->dest.ssa;
 }
 
-nir_ssa_def *
-vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
-{
-        switch (swiz) {
-        default:
-        case UTIL_FORMAT_SWIZZLE_NONE:
-                fprintf(stderr, "warning: unknown swizzle\n");
-                /* FALLTHROUGH */
-        case UTIL_FORMAT_SWIZZLE_0:
-                return nir_imm_float(b, 0.0);
-        case UTIL_FORMAT_SWIZZLE_1:
-                return nir_imm_float(b, 1.0);
-        case UTIL_FORMAT_SWIZZLE_X:
-        case UTIL_FORMAT_SWIZZLE_Y:
-        case UTIL_FORMAT_SWIZZLE_Z:
-        case UTIL_FORMAT_SWIZZLE_W:
-                return srcs[swiz];
-        }
-}
-
 static struct qreg *
 ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
 {
@@ -338,30 +318,15 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
         struct qreg tex = qir_TEX_RESULT(c);
         c->num_texture_samples++;
 
-        struct qreg texture_output[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         enum pipe_format format = c->key->tex[unit].format;
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
                 for (int i = 0; i < 4; i++)
-                        texture_output[i] = scaled;
+                        dest[i] = scaled;
         } else {
-                struct qreg tex_result_unpacked[4];
                 for (int i = 0; i < 4; i++)
-                        tex_result_unpacked[i] = qir_UNPACK_8_F(c, tex, i);
-
-                const uint8_t *format_swiz =
-                        vc4_get_format_swizzle(c->key->tex[unit].format);
-                for (int i = 0; i < 4; i++) {
-                        texture_output[i] =
-                                get_swizzled_channel(c, tex_result_unpacked,
-                                                     format_swiz[i]);
-                }
-        }
-
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
-        for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 }
 
@@ -470,7 +435,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 
         enum pipe_format format = c->key->tex[unit].format;
 
-        struct qreg unpacked[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg normalized = ntq_scale_depth_texture(c, tex);
                 struct qreg depth_output;
@@ -518,29 +483,15 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                 }
 
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = depth_output;
+                        dest[i] = depth_output;
         } else {
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = qir_UNPACK_8_F(c, tex, i);
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
-        const uint8_t *format_swiz = vc4_get_format_swizzle(format);
-        struct qreg texture_output[4];
         for (int i = 0; i < 4; i++) {
-                texture_output[i] = get_swizzled_channel(c, unpacked,
-                                                         format_swiz[i]);
-        }
-
-        if (util_format_is_srgb(format)) {
-                for (int i = 0; i < 3; i++)
-                        texture_output[i] = qir_srgb_decode(c,
-                                                            texture_output[i]);
-        }
-
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
-        for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
         }
 }
 
@@ -1805,6 +1756,56 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         if (stage == QSTAGE_FRAG)
                 vc4_nir_lower_blend(c);
 
+        struct nir_lower_tex_options tex_options = {
+                /* We would need to implement txs, but we don't want the
+                 * int/float conversions
+                 */
+                .lower_rect = false,
+
+                /* We want to use this, but we don't want to newton-raphson
+                 * its rcp.
+                 */
+                .lower_txp = false,
+
+                /* Apply swizzles to all samplers. */
+                .swizzle_result = ~0,
+        };
+
+        /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
+         * The format swizzling applies before sRGB decode, and
+         * ARB_texture_swizzle is the last thing before returning the sample.
+         */
+        for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
+                enum pipe_format format = c->key->tex[i].format;
+
+                if (!format)
+                        continue;
+
+                const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
+
+                for (int j = 0; j < 4; j++) {
+                        uint8_t arb_swiz = c->key->tex[i].swizzle[j];
+
+                        if (arb_swiz <= 3) {
+                                tex_options.swizzles[i][j] =
+                                        format_swizzle[arb_swiz];
+                        } else {
+                                tex_options.swizzles[i][j] = arb_swiz;
+                        }
+
+                        /* If ARB_texture_swizzle is reading from the R, G, or
+                         * B channels of an sRGB texture, then we need to
+                         * apply sRGB decode to this channel at sample time.
+                         */
+                        if (arb_swiz < 3 && util_format_is_srgb(format)) {
+                                c->tex_srgb_decode[i] |= (1 << j);
+                        }
+
+                }
+        }
+
+        nir_lower_tex(c->s, &tex_options);
+
         if (c->fs_key && c->fs_key->light_twoside)
                 nir_lower_two_sided_color(c->s);
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 9dad80dddff..4ab4d35d0ca 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -369,6 +369,11 @@ struct vc4_compile {
 
         uint8_t vattr_sizes[8];
 
+        /* Bitfield for whether a given channel of a sampler needs sRGB
+         * decode.
+         */
+        uint8_t tex_srgb_decode[VC4_MAX_TEXTURE_SAMPLERS];
+
         /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *

From f01ca9eeda266af8c622b07b92543aae802c9fed Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 5 Jan 2016 16:25:07 -0800
Subject: [PATCH 117/241] vc4: Add support for GL_ARB_texture_swizzle.

We already had the code supporting it, since it's needed for the depth
mode when doing shadow comparisons.
---
 src/gallium/drivers/vc4/vc4_screen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index a4acf2cdb0f..2ee5a777d20 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -100,6 +100,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TWO_SIDED_STENCIL:
         case PIPE_CAP_USER_INDEX_BUFFERS:
         case PIPE_CAP_TEXTURE_MULTISAMPLE:
+        case PIPE_CAP_TEXTURE_SWIZZLE:
                 return 1;
 
                 /* lying for GL 2.0 */
@@ -128,7 +129,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
         case PIPE_CAP_CUBE_MAP_ARRAY:
         case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-        case PIPE_CAP_TEXTURE_SWIZZLE:
         case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
         case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
         case PIPE_CAP_SEAMLESS_CUBE_MAP:

From 7a9eb76786ea5534d395eef199974f8221e047d8 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 5 Jan 2016 16:36:28 -0800
Subject: [PATCH 118/241] vc4: Add missing sRGB decode to texel fetches.

We only see txf on MSAA textures, currently, and apparently this didn't
impact any of our piglit tests.
---
 src/gallium/drivers/vc4/vc4_program.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 9d686f72877..e04cca1c09d 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -328,6 +328,11 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
                 for (int i = 0; i < 4; i++)
                         dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
+
+        for (int i = 0; i < 4; i++) {
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
+        }
 }
 
 static void

From 25aa436e8690dea181049e312e3e7f5f0c9d45da Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 5 Jan 2016 17:18:09 -0800
Subject: [PATCH 119/241] vc4: Optimize out a comparison for bcsel based on an
 ALU comparison

We routinely have code like:

	vec1 ssa_220 = fge ssa_104, ssa_61
	vec1 ssa_199 = bcsel ssa_220, ssa_106, ssa_105

and we would compare fge's args and choose between ~0 and 0 to generate
ssa_220, then compare ssa_220 to 0 and choose between bcsel's args.
Instead, try to notice the pattern and use a single comparison of fge's
args to select directly between bcsel's args.

total instructions in shared programs: 88019 -> 87574 (-0.51%)
instructions in affected programs:     9985 -> 9540 (-4.46%)
total estimated cycles in shared programs: 245752 -> 245237 (-0.21%)
estimated cycles in affected programs:     17232 -> 16717 (-2.99%)
---
 src/gallium/drivers/vc4/vc4_program.c | 73 ++++++++++++++++++++++-----
 1 file changed, 59 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index e04cca1c09d..ede14ab9b1d 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -845,13 +845,19 @@ ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
         return qir_UNPACK_8_I(c, base, offset_bit / 8);
 }
 
-static struct qreg
-ntq_emit_comparison(struct vc4_compile *c, nir_alu_instr *instr,
-                    struct qreg src0, struct qreg src1)
+/**
+ * If compare_instr is a valid comparison instruction, emits the
+ * compare_instr's comparison and returns the sel_instr's return value based
+ * on the compare_instr's result.
+ */
+static bool
+ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
+                    nir_alu_instr *compare_instr,
+                    nir_alu_instr *sel_instr)
 {
         enum qpu_cond cond;
 
-        switch (instr->op) {
+        switch (compare_instr->op) {
         case nir_op_feq:
         case nir_op_ieq:
         case nir_op_seq:
@@ -874,25 +880,63 @@ ntq_emit_comparison(struct vc4_compile *c, nir_alu_instr *instr,
                 cond = QPU_COND_NS;
                 break;
         default:
-                unreachable("bad ALU op for comparison");
+                return false;
         }
 
-        if (nir_op_infos[instr->op].input_types[0] == nir_type_float)
+        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
+        struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
+
+        if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float)
                 qir_SF(c, qir_FSUB(c, src0, src1));
         else
                 qir_SF(c, qir_SUB(c, src0, src1));
 
-        switch (instr->op) {
+        switch (sel_instr->op) {
         case nir_op_seq:
         case nir_op_sne:
         case nir_op_sge:
         case nir_op_slt:
-                return qir_SEL(c, cond,
-                               qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
+                break;
+
+        case nir_op_bcsel:
+                *dest = qir_SEL(c, cond,
+                                ntq_get_alu_src(c, sel_instr, 1),
+                                ntq_get_alu_src(c, sel_instr, 2));
+                break;
+
         default:
-                return qir_SEL(c, cond,
-                               qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0.0));
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
+                break;
         }
+
+        return true;
+}
+
+/**
+ * Attempts to fold a comparison generating a boolean result into the
+ * condition code for selecting between two values, instead of comparing the
+ * boolean result against 0 to generate the condition code.
+ */
+static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
+                                  struct qreg *src)
+{
+        if (!instr->src[0].src.is_ssa)
+                goto out;
+        nir_alu_instr *compare =
+                nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        if (!compare)
+                goto out;
+
+        struct qreg dest;
+        if (ntq_emit_comparison(c, &dest, compare, instr))
+                return dest;
+
+out:
+        qir_SF(c, src[0]);
+        return qir_SEL(c, QPU_COND_NS, src[1], src[2]);
 }
 
 static void
@@ -1037,12 +1081,13 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_ige:
         case nir_op_uge:
         case nir_op_ilt:
-                *dest = ntq_emit_comparison(c, instr, src[0], src[1]);
+                if (!ntq_emit_comparison(c, dest, instr, instr)) {
+                        fprintf(stderr, "Bad comparison instruction\n");
+                }
                 break;
 
         case nir_op_bcsel:
-                qir_SF(c, src[0]);
-                *dest = qir_SEL(c, QPU_COND_NS, src[1], src[2]);
+                *dest = ntq_emit_bcsel(c, instr, src);
                 break;
         case nir_op_fcsel:
                 qir_SF(c, src[0]);

From bbd29f13759e41ef14a77daa179a7c294a6aaa41 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 6 Jan 2016 12:48:19 -0800
Subject: [PATCH 120/241] vc4: Fix driver build from last minute rebase fix.

I had the driver all tested for the last series, and in my last build I
noticed that get_swizzled_channel was unused now, and removed
it... apparently without testing to find that I removed the wrong channel
swizzle function.
---
 src/gallium/drivers/vc4/vc4_program.c | 41 +++++++++++++--------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ede14ab9b1d..3e402d048ba 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -123,6 +123,26 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
         return &intr->dest.ssa;
 }
 
+nir_ssa_def *
+vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+{
+        switch (swiz) {
+        default:
+        case UTIL_FORMAT_SWIZZLE_NONE:
+                fprintf(stderr, "warning: unknown swizzle\n");
+                /* FALLTHROUGH */
+        case UTIL_FORMAT_SWIZZLE_0:
+                return nir_imm_float(b, 0.0);
+        case UTIL_FORMAT_SWIZZLE_1:
+                return nir_imm_float(b, 1.0);
+        case UTIL_FORMAT_SWIZZLE_X:
+        case UTIL_FORMAT_SWIZZLE_Y:
+        case UTIL_FORMAT_SWIZZLE_Z:
+        case UTIL_FORMAT_SWIZZLE_W:
+                return srcs[swiz];
+        }
+}
+
 static struct qreg *
 ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
 {
@@ -184,27 +204,6 @@ ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
         return r;
 };
 
-static struct qreg
-get_swizzled_channel(struct vc4_compile *c,
-                     struct qreg *srcs, int swiz)
-{
-        switch (swiz) {
-        default:
-        case UTIL_FORMAT_SWIZZLE_NONE:
-                fprintf(stderr, "warning: unknown swizzle\n");
-                /* FALLTHROUGH */
-        case UTIL_FORMAT_SWIZZLE_0:
-                return qir_uniform_f(c, 0.0);
-        case UTIL_FORMAT_SWIZZLE_1:
-                return qir_uniform_f(c, 1.0);
-        case UTIL_FORMAT_SWIZZLE_X:
-        case UTIL_FORMAT_SWIZZLE_Y:
-        case UTIL_FORMAT_SWIZZLE_Z:
-        case UTIL_FORMAT_SWIZZLE_W:
-                return srcs[swiz];
-        }
-}
-
 static inline struct qreg
 qir_SAT(struct vc4_compile *c, struct qreg val)
 {

From 0d7477a2899fe74134cd3e12e7adf1eec0c77f7e Mon Sep 17 00:00:00 2001
From: Krzysztof Sobiecki <sobkas@gmail.com>
Date: Tue, 29 Dec 2015 20:27:44 +0100
Subject: [PATCH 121/241] gallium/r600: Replace ALIGN_DIVUP with DIV_ROUND_UP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ALIGN_DIVUP is a driver-specific (r600g) macro that duplicates DIV_ROUND_UP functionality.
Replacing it with DIV_ROUND_UP eliminates this problem.

Signed-off-by: Krzysztof A. Sobiecki <sobkas@gmail.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r600/evergreen_state.c | 2 +-
 src/gallium/drivers/r600/r600_pipe.h       | 1 -
 src/gallium/drivers/r600/r600_state.c      | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 1aee7dd2da8..9dfb84965cf 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1956,7 +1956,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 
 		if (!gs_ring_buffer) {
 			radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4,
-						    ALIGN_DIVUP(cb->buffer_size, 256), pkt_flags);
+						    DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags);
 			radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8,
 						    pkt_flags);
 		}
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 31f2a729494..0e4dd16525b 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -946,7 +946,6 @@ static inline uint32_t S_FIXED(float value, uint32_t frac_bits)
 {
 	return value * (1 << frac_bits);
 }
-#define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
 /* 12.4 fixed-point */
 static inline unsigned r600_pack_float_12p4(float x)
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 43b80742cb5..f60e30486a2 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1768,7 +1768,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 
 		if (!gs_ring_buffer) {
 			radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4,
-					       ALIGN_DIVUP(cb->buffer_size, 256));
+					       DIV_ROUND_UP(cb->buffer_size, 256));
 			radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8);
 		}
 

From 30991d7389b6b475625ccce2aaae2f3f8cf8e95f Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 22 Dec 2015 10:14:45 +1100
Subject: [PATCH 122/241] glsl: remove unused varyings before packing them

Previously we would pack varyings before trying to remove them. This
relied on the packing pass not packing varyings with a location of -1
to avoid packing varyings that should be removed.
However, this meant unused varyings with an explicit location would be
packed before they could be removed when we enable packing of them in a
later patch.

V2: fix regression in V1 removing unused varyings in multi-stage SSO,
fix regression with single stage programs.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/link_varyings.cpp | 45 +++++++++++++++++++++++++++++++++
 src/glsl/link_varyings.h   |  5 ++++
 src/glsl/linker.cpp        | 52 +++-----------------------------------
 3 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index c43abbcc6a5..a80a1b5660d 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -309,6 +309,41 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
    }
 }
 
+/**
+ * Demote shader inputs and outputs that are not used in other stages, and
+ * remove them via dead code elimination.
+ */
+void
+remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
+                                        gl_shader *sh,
+                                        enum ir_variable_mode mode)
+{
+   if (is_separate_shader_object)
+      return;
+
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *const var = node->as_variable();
+
+      if ((var == NULL) || (var->data.mode != int(mode)))
+	 continue;
+
+      /* A shader 'in' or 'out' variable is only really an input or output if
+       * its value is used by other shader stages. This will cause the
+       * variable to have a location assigned.
+       */
+      if (var->data.is_unmatched_generic_inout) {
+         assert(var->data.mode != ir_var_temporary);
+	 var->data.mode = ir_var_auto;
+      }
+   }
+
+   /* Eliminate code that is now dead due to unused inputs/outputs being
+    * demoted.
+    */
+   while (do_dead_code(sh->ir, false))
+      ;
+
+}
 
 /**
  * Initialize this object based on a string that was passed to
@@ -1671,6 +1706,16 @@ assign_varying_locations(struct gl_context *ctx,
             }
          }
       }
+
+      /* Now that validation is done its safe to remove unused varyings. As
+       * we have both a producer and consumer its safe to remove unused
+       * varyings even if the program is a SSO because the stages are being
+       * linked together i.e. we have a multi-stage SSO.
+       */
+      remove_unused_shader_inputs_and_outputs(false, producer,
+                                              ir_var_shader_out);
+      remove_unused_shader_inputs_and_outputs(false, consumer,
+                                              ir_var_shader_in);
    }
 
    if (!disable_varying_packing) {
diff --git a/src/glsl/link_varyings.h b/src/glsl/link_varyings.h
index 1d12978fa30..b2812614ecc 100644
--- a/src/glsl/link_varyings.h
+++ b/src/glsl/link_varyings.h
@@ -268,6 +268,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
                       const void *mem_ctx, unsigned num_names,
                       char **varying_names, tfeedback_decl *decls);
 
+void
+remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
+                                        gl_shader *sh,
+                                        enum ir_variable_mode mode);
+
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index acc63ae8cf6..7a18523fe23 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2722,30 +2722,6 @@ match_explicit_outputs_to_inputs(struct gl_shader_program *prog,
    }
 }
 
-/**
- * Demote shader inputs and outputs that are not used in other stages
- */
-void
-demote_shader_inputs_and_outputs(gl_shader *sh, enum ir_variable_mode mode)
-{
-   foreach_in_list(ir_instruction, node, sh->ir) {
-      ir_variable *const var = node->as_variable();
-
-      if ((var == NULL) || (var->data.mode != int(mode)))
-	 continue;
-
-      /* A shader 'in' or 'out' variable is only really an input or output if
-       * its value is used by other shader stages.  This will cause the variable
-       * to have a location assigned.
-       */
-      if (var->data.is_unmatched_generic_inout) {
-         assert(var->data.mode != ir_var_temporary);
-	 var->data.mode = ir_var_auto;
-      }
-   }
-}
-
-
 /**
  * Store the gl_FragDepth layout in the gl_shader_program struct.
  */
@@ -4446,14 +4422,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       do_dead_builtin_varyings(ctx, sh, NULL,
                                num_tfeedback_decls, tfeedback_decls);
 
-      if (!prog->SeparateShader) {
-         demote_shader_inputs_and_outputs(sh, ir_var_shader_out);
-         /* Eliminate code that is now dead due to unused outputs being
-          * demoted.
-          */
-         while (do_dead_code(sh->ir, false))
-            ;
-      }
+      remove_unused_shader_inputs_and_outputs(prog->SeparateShader, sh,
+                                              ir_var_shader_out);
    }
    else if (first == MESA_SHADER_FRAGMENT) {
       /* If the program only contains a fragment shader...
@@ -4471,12 +4441,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                                        NULL /* tfeedback_decls */))
             goto done;
       } else {
-         demote_shader_inputs_and_outputs(sh, ir_var_shader_in);
-         /* Eliminate code that is now dead due to unused inputs being
-          * demoted.
-          */
-         while (do_dead_code(sh->ir, false))
-            ;
+         remove_unused_shader_inputs_and_outputs(false, sh,
+                                                 ir_var_shader_in);
       }
    }
 
@@ -4497,16 +4463,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                 next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
                 tfeedback_decls);
 
-      demote_shader_inputs_and_outputs(sh_i, ir_var_shader_out);
-      demote_shader_inputs_and_outputs(sh_next, ir_var_shader_in);
-
-      /* Eliminate code that is now dead due to unused outputs being demoted.
-       */
-      while (do_dead_code(sh_i->ir, false))
-         ;
-      while (do_dead_code(sh_next->ir, false))
-         ;
-
       /* This must be done after all dead varyings are eliminated. */
       if (!check_against_output_limit(ctx, prog, sh_i))
          goto done;

From 5907a02ab6fbe20b4ba58eb00bf93261129798d5 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 18 Dec 2015 13:53:27 +1100
Subject: [PATCH 123/241] glsl: create helper to remove outer vertex index
 array used by some stages

This will be used in the following patch for calculating array sizes correctly
when reserving explicit varying locations.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/link_varyings.cpp | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index a80a1b5660d..863a3995c00 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -40,6 +40,29 @@
 #include "program.h"
 
 
+/**
+ * Get the varying type stripped of the outermost array if we're processing
+ * a stage whose varyings are arrays indexed by a vertex number (such as
+ * geometry shader inputs).
+ */
+static const glsl_type *
+get_varying_type(const ir_variable *var, gl_shader_stage stage)
+{
+   const glsl_type *type = var->type;
+
+   if (!var->data.patch &&
+       ((var->data.mode == ir_var_shader_out &&
+         stage == MESA_SHADER_TESS_CTRL) ||
+        (var->data.mode == ir_var_shader_in &&
+         (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_TESS_EVAL ||
+          stage == MESA_SHADER_GEOMETRY)))) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   return type;
+}
+
 /**
  * Validate the types and qualifiers of an output from one stage against the
  * matching input to another stage.
@@ -981,18 +1004,11 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
    if (this->disable_varying_packing) {
-      const struct glsl_type *type = var->type;
       unsigned slots;
+      gl_shader_stage stage =
+         (producer_var != NULL) ? producer_stage : consumer_stage;
 
-      /* Some shader stages have 2-dimensional varyings. Use the inner type. */
-      if (!var->data.patch &&
-          ((var == producer_var && producer_stage == MESA_SHADER_TESS_CTRL) ||
-           (var == consumer_var && (consumer_stage == MESA_SHADER_TESS_CTRL ||
-                                    consumer_stage == MESA_SHADER_TESS_EVAL ||
-                                    consumer_stage == MESA_SHADER_GEOMETRY)))) {
-         assert(type->is_array());
-         type = type->fields.array;
-      }
+      const glsl_type *type = get_varying_type(var, stage);
 
       slots = type->count_attribute_slots(false);
       this->matches[this->num_matches].num_components = slots * 4;

From ac6e2c2056469226fdeefb96bee632546f45a0fb Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 15 Dec 2015 16:40:26 +1100
Subject: [PATCH 124/241] glsl: fix overlapping of varying locations for arrays
 and structs

Previously we were only reserving a single location for arrays and
structs.

We also didn't take into account implicit locations clashing with
explicit locations when assigning locations for their arrays or
structs.

This patch fixes both issues.

V5: fix regression for patch inputs/outputs in tessellation shaders
V4: just use count_attribute_slots() to get the number of slots,
also calculate the correct number of slots to reserve for gs and
tess stages by making use of the new get_varying_type() helper.
V3: handle arrays of structs
V2: also fix for arrays of arrays and structs.

Acked-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/link_varyings.cpp | 79 ++++++++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 12 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 863a3995c00..1da0c9e5527 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -825,7 +825,8 @@ public:
                    gl_shader_stage consumer_stage);
    ~varying_matches();
    void record(ir_variable *producer_var, ir_variable *consumer_var);
-   unsigned assign_locations(uint64_t reserved_slots, bool separate_shader);
+   unsigned assign_locations(struct gl_shader_program *prog,
+                             uint64_t reserved_slots, bool separate_shader);
    void store_locations() const;
 
 private:
@@ -1031,7 +1032,9 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
  * passed to varying_matches::record().
  */
 unsigned
-varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
+varying_matches::assign_locations(struct gl_shader_program *prog,
+                                  uint64_t reserved_slots,
+                                  bool separate_shader)
 {
    /* We disable varying sorting for separate shader programs for the
     * following reasons:
@@ -1068,10 +1071,20 @@ varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
    for (unsigned i = 0; i < this->num_matches; i++) {
       unsigned *location = &generic_location;
 
-      if ((this->matches[i].consumer_var &&
-           this->matches[i].consumer_var->data.patch) ||
-          (this->matches[i].producer_var &&
-           this->matches[i].producer_var->data.patch))
+      const ir_variable *var;
+      const glsl_type *type;
+      bool is_vertex_input = false;
+      if (matches[i].consumer_var) {
+         var = matches[i].consumer_var;
+         type = get_varying_type(var, consumer_stage);
+         if (consumer_stage == MESA_SHADER_VERTEX)
+            is_vertex_input = true;
+      } else {
+         var = matches[i].producer_var;
+         type = get_varying_type(var, producer_stage);
+      }
+
+      if (var->data.patch)
          location = &generic_patch_location;
 
       /* Advance to the next slot if this varying has a different packing
@@ -1083,9 +1096,45 @@ varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
           != this->matches[i].packing_class) {
          *location = ALIGN(*location, 4);
       }
-      while ((*location < MAX_VARYING * 4u) &&
-            (reserved_slots & (1u << *location / 4u))) {
-         *location = ALIGN(*location + 1, 4);
+
+      unsigned num_elements =  type->count_attribute_slots(is_vertex_input);
+      unsigned slot_end = this->disable_varying_packing ? 4 :
+         type->without_array()->vector_elements;
+      slot_end += *location - 1;
+
+      /* FIXME: We could be smarter in the below code and loop back over
+       * trying to fill any locations that we skipped because we couldn't pack
+       * the varying between an explicit location. For now just let the user
+       * hit the linking error if we run out of room and suggest they use
+       * explicit locations.
+       */
+      for (unsigned j = 0; j < num_elements; j++) {
+         while ((slot_end < MAX_VARYING * 4u) &&
+                ((reserved_slots & (1u << *location / 4u) ||
+                 (reserved_slots & (1u << slot_end / 4u))))) {
+
+            *location = ALIGN(*location + 1, 4);
+            slot_end = *location;
+
+            /* reset the counter and try again */
+            j = 0;
+         }
+
+         /* Increase the slot to make sure there is enough room for next
+          * array element.
+          */
+         if (this->disable_varying_packing)
+            slot_end += 4;
+         else
+            slot_end += type->without_array()->vector_elements;
+      }
+
+      if (!var->data.patch && *location >= MAX_VARYING * 4u) {
+         linker_error(prog, "insufficient contiguous locations available for "
+                      "%s it is possible an array or struct could not be "
+                      "packed between varyings with explicit locations. Try "
+                      "using an explicit location for arrays and structs.",
+                      var->name);
       }
 
       this->matches[i].generic_location = *location;
@@ -1473,8 +1522,14 @@ reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode)
          continue;
 
       var_slot = var->data.location - VARYING_SLOT_VAR0;
-      if (var_slot >= 0 && var_slot < MAX_VARYING)
-         slots |= 1u << var_slot;
+
+      unsigned num_elements = get_varying_type(var, stage->Stage)
+         ->count_attribute_slots(stage->Stage == MESA_SHADER_VERTEX);
+      for (unsigned i = 0; i < num_elements; i++) {
+         if (var_slot >= 0 && var_slot < MAX_VARYING)
+            slots |= 1u << var_slot;
+         var_slot += 1;
+      }
    }
 
    return slots;
@@ -1660,7 +1715,7 @@ assign_varying_locations(struct gl_context *ctx,
       reserved_varying_slot(producer, ir_var_shader_out) |
       reserved_varying_slot(consumer, ir_var_shader_in);
 
-   const unsigned slots_used = matches.assign_locations(reserved_slots,
+   const unsigned slots_used = matches.assign_locations(prog, reserved_slots,
                                                         prog->SeparateShader);
    matches.store_locations();
 

From 47dde2bd45eb5053042a20f70c6f0b7a86ebf1b1 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 15 Dec 2015 16:23:29 +1100
Subject: [PATCH 125/241] glsl: don't try adding built-ins to explicit
 locations bitmask

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/link_varyings.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 1da0c9e5527..8763cc5b07d 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -1518,7 +1518,9 @@ reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode)
    foreach_in_list(ir_instruction, node, stage->ir) {
       ir_variable *const var = node->as_variable();
 
-      if (var == NULL || var->data.mode != io_mode || !var->data.explicit_location)
+      if (var == NULL || var->data.mode != io_mode ||
+          !var->data.explicit_location ||
+          var->data.location < VARYING_SLOT_VAR0)
          continue;
 
       var_slot = var->data.location - VARYING_SLOT_VAR0;

From e58be8ac0e7ce53ed02721e1432a15f95b026b57 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 6 Jan 2016 20:22:46 +1100
Subject: [PATCH 126/241] glsl: fix varying slot allocation for blocks and
 structs with explicit locations

Previously each member was being counted as using a single slot,
count_attribute_slots() fixes the count for array and struct members.

Also don't assign a negative value to the unsigned expl_location variable.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/ast_to_hir.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index dbf05ac9999..e6aec3654b8 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -6375,12 +6375,13 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
             if (process_qualifier_constant(state, &loc, "location",
                                            qual->location, &qual_location)) {
                fields[i].location = VARYING_SLOT_VAR0 + qual_location;
-               expl_location = fields[i].location + 1;
+               expl_location = fields[i].location +
+                  fields[i].type->count_attribute_slots(false);
             }
          } else {
             if (layout && layout->flags.q.explicit_location) {
                fields[i].location = expl_location;
-               expl_location = expl_location + 1;
+               expl_location += fields[i].type->count_attribute_slots(false);
             } else {
                fields[i].location = -1;
             }
@@ -6484,7 +6485,7 @@ ast_struct_specifier::hir(exec_list *instructions,
 
    state->struct_specifier_depth++;
 
-   unsigned expl_location = -1;
+   unsigned expl_location = 0;
    if (layout && layout->flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",
                                       layout->location, &expl_location)) {
@@ -6671,7 +6672,7 @@ ast_interface_block::hir(exec_list *instructions,
       return NULL;
    }
 
-   unsigned expl_location = -1;
+   unsigned expl_location = 0;
    if (layout.flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",
                                       layout.location, &expl_location)) {

From 72d6bbca5b0f646b7278af1eaf32c4e5f24ccf8f Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 17:38:00 -0700
Subject: [PATCH 127/241] st/mesa: fix comment indentation in
 st_flush_bitmap_cache()

---
 src/mesa/state_tracker/st_cb_bitmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 14e8354d480..4d6b8d6233c 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -501,8 +501,8 @@ st_flush_bitmap_cache(struct st_context *st)
 */
 
       /* The texture transfer has been mapped until now.
-          * So unmap and release the texture transfer before drawing.
-          */
+       * So unmap and release the texture transfer before drawing.
+       */
       if (cache->trans && cache->buffer) {
          if (0)
             print_cache(cache);

From c75d00e054c02ac32321ee0a4e6e2932ad2ad6ad Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 17:26:29 -0700
Subject: [PATCH 128/241] st/mesa: protect debug printf() with a conditional
 instead of comment

---
 src/mesa/state_tracker/st_cb_bitmap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 4d6b8d6233c..c2cbcbd6fc7 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -494,11 +494,11 @@ st_flush_bitmap_cache(struct st_context *st)
 
       assert(cache->xmin <= cache->xmax);
 
-/*    printf("flush size %d x %d  at %d, %d\n",
-             cache->xmax - cache->xmin,
-             cache->ymax - cache->ymin,
-             cache->xpos, cache->ypos);
-*/
+      if (0)
+         printf("flush bitmap, size %d x %d  at %d, %d\n",
+                cache->xmax - cache->xmin,
+                cache->ymax - cache->ymin,
+                cache->xpos, cache->ypos);
 
       /* The texture transfer has been mapped until now.
        * So unmap and release the texture transfer before drawing.

From c28d72a3473ad0127c82c1244b6688dcc184e85e Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 17:10:12 -0700
Subject: [PATCH 129/241] st/mesa: check state->mesa in early return check in
 st_validate_state()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We were checking the dirty->st flags but not the dirty->mesa flags.
When we took the early return, we didn't clear the dirty->mesa flags
so the next time we called st_validate_state() we'd often flush the
glBitmap cache.  And since st_validate_state() is called from
st_Bitmap(), it meant we flushed the bitmap cache for every glBitmap()
call.

This change seems to recover most of the performance loss observed
with the ipers demo on llvmpipe since commit 36c93a6fae27561.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_atom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 43dbadd4a7e..c1a9d00969f 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -188,7 +188,7 @@ void st_validate_state( struct st_context *st )
 
    st_manager_validate_framebuffers(st);
 
-   if (state->st == 0)
+   if (state->st == 0 && state->mesa == 0)
       return;
 
    /*printf("%s %x/%x\n", __func__, state->mesa, state->st);*/

From b6bcf0864138787c21b19cda3749c80c6ad74604 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 18:28:57 -0700
Subject: [PATCH 130/241] st/mesa: move bitmap cache flushing out of state
 validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Just do it where needed (before drawing, clearing, etc).

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_atom.c          | 4 ----
 src/mesa/state_tracker/st_cb_clear.c      | 3 +++
 src/mesa/state_tracker/st_cb_drawpixels.c | 5 +++++
 src/mesa/state_tracker/st_cb_drawtex.c    | 3 +++
 src/mesa/state_tracker/st_draw.c          | 3 +++
 src/mesa/state_tracker/st_draw_feedback.c | 3 +++
 6 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index c1a9d00969f..337213c1b80 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -33,7 +33,6 @@
 #include "pipe/p_defines.h"
 #include "st_context.h"
 #include "st_atom.h"
-#include "st_cb_bitmap.h"
 #include "st_program.h"
 #include "st_manager.h"
 
@@ -181,9 +180,6 @@ void st_validate_state( struct st_context *st )
 
    check_attrib_edgeflag(st);
 
-   if (state->mesa)
-      st_flush_bitmap_cache(st);
-
    check_program_state( st );
 
    st_manager_validate_framebuffers(st);
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index e09f5ec6a0b..7b6d10e76b1 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -41,6 +41,7 @@
 #include "program/prog_instruction.h"
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_clear.h"
 #include "st_cb_fbo.h"
 #include "st_format.h"
@@ -466,6 +467,8 @@ st_Clear(struct gl_context *ctx, GLbitfield mask)
    GLbitfield clear_buffers = 0x0;
    GLuint i;
 
+   st_flush_bitmap_cache(st);
+
    /* This makes sure the pipe has the latest scissor, etc values */
    st_validate_state( st );
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 86e8a55e25e..7ed52dd2600 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -50,6 +50,7 @@
 
 #include "st_atom.h"
 #include "st_atom_constbuf.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_drawpixels.h"
 #include "st_cb_readpixels.h"
 #include "st_cb_fbo.h"
@@ -1063,6 +1064,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    /* Mesa state should be up to date by now */
    assert(ctx->NewState == 0x0);
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    /* Limit the size of the glDrawPixels to the max texture size.
@@ -1422,6 +1425,8 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
    GLint readX, readY, readW, readH;
    struct gl_pixelstore_attrib pack = ctx->DefaultPacking;
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    if (type == GL_DEPTH_STENCIL) {
diff --git a/src/mesa/state_tracker/st_cb_drawtex.c b/src/mesa/state_tracker/st_cb_drawtex.c
index b3e4b5bb70c..e6ab77fb521 100644
--- a/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/src/mesa/state_tracker/st_cb_drawtex.c
@@ -21,6 +21,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_drawtex.h"
 
 #include "pipe/p_context.h"
@@ -113,6 +114,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    struct pipe_vertex_element velements[2 + MAX_TEXTURE_UNITS];
    unsigned offset;
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    /* determine if we need vertex color */
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 63b46222e6b..d7a97169bc2 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -48,6 +48,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_bufferobjects.h"
 #include "st_cb_xformfb.h"
 #include "st_debug.h"
@@ -197,6 +198,8 @@ st_draw_vbo(struct gl_context *ctx,
    /* Mesa core state should have been validated already */
    assert(ctx->NewState == 0x0);
 
+   st_flush_bitmap_cache(st);
+
    /* Validate state. */
    if (st->dirty.st || ctx->NewDriverState) {
       st_validate_state(st);
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index 88c10a8f150..b6e6dea5b27 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -33,6 +33,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_bufferobjects.h"
 #include "st_draw.h"
 #include "st_program.h"
@@ -137,6 +138,8 @@ st_feedback_draw_vbo(struct gl_context *ctx,
 
    assert(draw);
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    if (!index_bounds_valid)

From 2cc52801c05a636ddd52cdef7df338f69607c6fe Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Jan 2016 18:11:14 -0700
Subject: [PATCH 131/241] st/mesa: be more careful about state validation in
 st_Bitmap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the only dirty state is mesa's _NEW_PROGRAM_CONSTANTS flag, we can
skip state validation before drawing a bitmap since that state doesn't
affect bitmap rendering.

This further increases the performance of the ipers demo on llvmpipe
to about what it was before commit 36c93a6fae27561.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_cb_bitmap.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index c2cbcbd6fc7..191f1443131 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -622,7 +622,14 @@ st_Bitmap(struct gl_context *ctx, GLint x, GLint y,
    if (width == 0 || height == 0)
       return;
 
-   st_validate_state(st);
+   /* We only need to validate state of the st dirty flags are set or
+    * any non-_NEW_PROGRAM_CONSTANTS mesa flags are set.  The VS we use
+    * for bitmap drawing uses no constants and the FS constants are
+    * explicitly uploaded in the draw_bitmap_quad() function.
+    */
+   if ((st->dirty.mesa & ~_NEW_PROGRAM_CONSTANTS) || st->dirty.st) {
+      st_validate_state(st);
+   }
 
    if (!st->bitmap.vs) {
       /* create pass-through vertex shader now */

From 4cd1bd46edff7cb2c195c2dc19d74b1ba44122d2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 6 Jan 2016 08:33:36 -0700
Subject: [PATCH 132/241] s/GLuint/GLbitfield/ for st_invalidate_state()
 parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To match dd_function_table::UpdateState().

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_context.c | 2 +-
 src/mesa/state_tracker/st_context.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 31cc99dca89..e12c1663d3f 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -99,7 +99,7 @@ static void st_Enable(struct gl_context * ctx, GLenum cap, GLboolean state)
 /**
  * Called via ctx->Driver.UpdateState()
  */
-void st_invalidate_state(struct gl_context * ctx, GLuint new_state)
+void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state)
 {
    struct st_context *st = st_context(ctx);
 
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 276fa63223e..35c89321a54 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -252,7 +252,7 @@ struct st_framebuffer
 extern void st_init_driver_functions(struct pipe_screen *screen,
                                      struct dd_function_table *functions);
 
-void st_invalidate_state(struct gl_context * ctx, GLuint new_state);
+void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state);
 
 
 

From 3c0521cd0fe9b82c9481f9652bb971bd625d1e40 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 6 Jan 2016 08:32:02 -0700
Subject: [PATCH 133/241] st/mesa: use GLbitfield in st_state_flags, add
 comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use GLbitfield instead of GLuint to be consistent with other variables.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_context.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 35c89321a54..91b0f975f3f 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -65,8 +65,8 @@ struct u_upload_mgr;
 
 
 struct st_state_flags {
-   GLuint mesa;
-   uint64_t st;
+   GLbitfield mesa;  /**< Mask of _NEW_x flags */
+   uint64_t st;      /**< Mask of ST_NEW_x flags */
 };
 
 struct st_tracked_state {

From c81ddc2092cceb07178a9554aabc5ecf92d15557 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 6 Jan 2016 08:38:03 -0700
Subject: [PATCH 134/241] vbo: s/GLuint/GLbitfield/ for state bitmasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/vbo/vbo.h         | 2 +-
 src/mesa/vbo/vbo_context.c | 2 +-
 src/mesa/vbo/vbo_exec.c    | 2 +-
 src/mesa/vbo/vbo_exec.h    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index cef3b8cd792..dd9b428b104 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -78,7 +78,7 @@ struct _mesa_index_buffer {
 
 GLboolean _vbo_CreateContext( struct gl_context *ctx );
 void _vbo_DestroyContext( struct gl_context *ctx );
-void _vbo_InvalidateState( struct gl_context *ctx, GLuint new_state );
+void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state );
 
 
 void
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index 5e1a760eb2c..19b35a429b3 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -186,7 +186,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
 }
 
 
-void _vbo_InvalidateState( struct gl_context *ctx, GLuint new_state )
+void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state )
 {
    vbo_exec_invalidate_state(ctx, new_state);
 }
diff --git a/src/mesa/vbo/vbo_exec.c b/src/mesa/vbo/vbo_exec.c
index a301c6c9a22..4db4f4088b9 100644
--- a/src/mesa/vbo/vbo_exec.c
+++ b/src/mesa/vbo/vbo_exec.c
@@ -73,7 +73,7 @@ void vbo_exec_destroy( struct gl_context *ctx )
  * invoked according to the state flags.  That will have to wait for a
  * mesa rework:
  */ 
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state )
+void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state )
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h
index a80b2c908d1..27bff4a2aa9 100644
--- a/src/mesa/vbo/vbo_exec.h
+++ b/src/mesa/vbo/vbo_exec.h
@@ -146,7 +146,7 @@ struct vbo_exec_context
  */
 void vbo_exec_init( struct gl_context *ctx );
 void vbo_exec_destroy( struct gl_context *ctx );
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state );
+void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
 
 
 /* Internal functions:

From 0d39b5fc3b5ca186814a23c07987570800aa17ec Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 6 Jan 2016 08:38:33 -0700
Subject: [PATCH 135/241] main: s/GLuint/GLbitfield/ for state bitmasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/main/api_arrayelt.c | 4 ++--
 src/mesa/main/api_arrayelt.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/api_arrayelt.c b/src/mesa/main/api_arrayelt.c
index 92d8238f431..c84db5f97f6 100644
--- a/src/mesa/main/api_arrayelt.c
+++ b/src/mesa/main/api_arrayelt.c
@@ -65,7 +65,7 @@ typedef struct {
 typedef struct {
    AEarray arrays[32];
    AEattrib attribs[VERT_ATTRIB_MAX + 1];
-   GLuint NewState;
+   GLbitfield NewState;
 
    /* List of VBOs we need to map before executing ArrayElements */
    struct gl_buffer_object *vbo[VERT_ATTRIB_MAX];
@@ -1802,7 +1802,7 @@ _ae_ArrayElement(GLint elt)
 
 
 void
-_ae_invalidate_state(struct gl_context *ctx, GLuint new_state)
+_ae_invalidate_state(struct gl_context *ctx, GLbitfield new_state)
 {
    AEcontext *actx = AE_CONTEXT(ctx);
 
diff --git a/src/mesa/main/api_arrayelt.h b/src/mesa/main/api_arrayelt.h
index 39fdeb9d2bd..03cd9ecbd51 100644
--- a/src/mesa/main/api_arrayelt.h
+++ b/src/mesa/main/api_arrayelt.h
@@ -33,7 +33,7 @@
 
 extern GLboolean _ae_create_context( struct gl_context *ctx );
 extern void _ae_destroy_context( struct gl_context *ctx );
-extern void _ae_invalidate_state( struct gl_context *ctx, GLuint new_state );
+extern void _ae_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
 extern void GLAPIENTRY _ae_ArrayElement( GLint elt );
 
 /* May optionally be called before a batch of element calls:

From c032ae85ee1581870a34f5faad76e5b7ddaf4090 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 17 Dec 2015 14:06:11 -0700
Subject: [PATCH 136/241] st/mesa: move mipmap allocation check logic into a
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Better readability and easier to extend.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_cb_texture.c | 54 ++++++++++++++++++++------
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 62f149aa0fb..867d4daad68 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -387,6 +387,43 @@ guess_base_level_size(GLenum target,
 }
 
 
+/**
+ * Try to determine whether we should allocate memory for a full texture
+ * mipmap.  The problem is when we get a glTexImage(level=0) call, we
+ * can't immediately know if other mipmap levels are coming next.  Here
+ * we try to guess whether to allocate memory for a mipmap or just the
+ * 0th level.
+ *
+ * If we guess incorrectly here we'll later reallocate the right amount of
+ * memory either in st_AllocTextureImageBuffer() or st_finalize_texture().
+ *
+ * \param stObj  the texture object we're going to allocate memory for.
+ * \param stImage  describes the incoming image which we need to store.
+ */
+static boolean
+allocate_full_mipmap(const struct st_texture_object *stObj,
+                     const struct st_texture_image *stImage)
+{
+   if (stImage->base.Level > 0 || stObj->base.GenerateMipmap)
+      return TRUE;
+
+   if (stImage->base._BaseFormat == GL_DEPTH_COMPONENT ||
+       stImage->base._BaseFormat == GL_DEPTH_STENCIL_EXT)
+      /* depth/stencil textures are seldom mipmapped */
+      return FALSE;
+
+   if (stObj->base.BaseLevel == 0 && stObj->base.MaxLevel == 0)
+      return FALSE;
+
+   if (stObj->base.Sampler.MinFilter == GL_NEAREST ||
+       stObj->base.Sampler.MinFilter == GL_LINEAR)
+      /* not a mipmap minification filter */
+      return FALSE;
+
+   return TRUE;
+}
+
+
 /**
  * Try to allocate a pipe_resource object for the given st_texture_object.
  *
@@ -431,22 +468,15 @@ guess_and_alloc_texture(struct st_context *st,
     * to re-allocating a texture buffer with space for more (or fewer)
     * mipmap levels later.
     */
-   if ((stObj->base.Sampler.MinFilter == GL_NEAREST ||
-        stObj->base.Sampler.MinFilter == GL_LINEAR ||
-        (stObj->base.BaseLevel == 0 &&
-         stObj->base.MaxLevel == 0) ||
-        stImage->base._BaseFormat == GL_DEPTH_COMPONENT ||
-        stImage->base._BaseFormat == GL_DEPTH_STENCIL_EXT) &&
-       !stObj->base.GenerateMipmap &&
-       stImage->base.Level == 0) {
-      /* only alloc space for a single mipmap level */
-      lastLevel = 0;
-   }
-   else {
+   if (allocate_full_mipmap(stObj, stImage)) {
       /* alloc space for a full mipmap */
       lastLevel = _mesa_get_tex_max_num_levels(stObj->base.Target,
                                                width, height, depth) - 1;
    }
+   else {
+      /* only alloc space for a single mipmap level */
+      lastLevel = 0;
+   }
 
    /* Save the level=0 dimensions */
    stObj->width0 = width;

From 18038b9fd6792be794ae1be80e006542be602b2a Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 17 Dec 2015 14:16:24 -0700
Subject: [PATCH 137/241] st/mesa: check texture target in
 allocate_full_mipmap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some kinds of textures never have mipmaps.  3D textures seldom have
mipmaps.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_cb_texture.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 867d4daad68..f8b367989e7 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -404,6 +404,16 @@ static boolean
 allocate_full_mipmap(const struct st_texture_object *stObj,
                      const struct st_texture_image *stImage)
 {
+   switch (stObj->base.Target) {
+   case GL_TEXTURE_RECTANGLE_NV:
+   case GL_TEXTURE_BUFFER:
+   case GL_TEXTURE_EXTERNAL_OES:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      /* these texture types cannot be mipmapped */
+      return FALSE;
+   }
+
    if (stImage->base.Level > 0 || stObj->base.GenerateMipmap)
       return TRUE;
 
@@ -420,6 +430,10 @@ allocate_full_mipmap(const struct st_texture_object *stObj,
       /* not a mipmap minification filter */
       return FALSE;
 
+   if (stObj->base.Target == GL_TEXTURE_3D)
+      /* 3D textures are seldom mipmapped */
+      return FALSE;
+
    return TRUE;
 }
 

From 85444ab08b8bd0f291101acf42620ffbaa8c77fb Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 6 Jan 2016 11:48:52 -0700
Subject: [PATCH 138/241] st/mesa: replace bitmap size checks with assertion

The _mesa_Bitmap() caller already checks for zero-sized bitmaps.
---
 src/mesa/state_tracker/st_cb_bitmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 191f1443131..d8c3dbdd793 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -619,8 +619,8 @@ st_Bitmap(struct gl_context *ctx, GLint x, GLint y,
    struct st_context *st = st_context(ctx);
    struct pipe_resource *pt;
 
-   if (width == 0 || height == 0)
-      return;
+   assert(width > 0);
+   assert(height > 0);
 
    /* We only need to validate state of the st dirty flags are set or
     * any non-_NEW_PROGRAM_CONSTANTS mesa flags are set.  The VS we use

From b59fad8478787665b7dc1618ca2a8b8df02feade Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 6 Jan 2016 15:45:08 -0700
Subject: [PATCH 139/241] st/mesa: minor clean-ups in st_atom.c

Remove useless comment.  Reformat code.
---
 src/mesa/state_tracker/st_atom.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 337213c1b80..03097225bb2 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -95,27 +95,26 @@ void st_destroy_atoms( struct st_context *st )
 }
 
 
-/***********************************************************************
- */
 
-static GLboolean check_state( const struct st_state_flags *a,
-			      const struct st_state_flags *b )
+static bool
+check_state(const struct st_state_flags *a, const struct st_state_flags *b)
 {
-   return ((a->mesa & b->mesa) ||
-	   (a->st & b->st));
+   return (a->mesa & b->mesa) || (a->st & b->st);
 }
 
-static void accumulate_state( struct st_state_flags *a,
-			      const struct st_state_flags *b )
+
+static void
+accumulate_state(struct st_state_flags *a, const struct st_state_flags *b)
 {
    a->mesa |= b->mesa;
    a->st |= b->st;
 }
 
 
-static void xor_states( struct st_state_flags *result,
-			     const struct st_state_flags *a,
-			      const struct st_state_flags *b )
+static void
+xor_states(struct st_state_flags *result,
+           const struct st_state_flags *a,
+           const struct st_state_flags *b)
 {
    result->mesa = a->mesa ^ b->mesa;
    result->st = a->st ^ b->st;
@@ -241,6 +240,3 @@ void st_validate_state( struct st_context *st )
 
    memset(state, 0, sizeof(*state));
 }
-
-
-

From b074a5b02de3dc0e2d0cbb6b9154673153b29525 Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Mon, 4 Jan 2016 10:36:48 -0800
Subject: [PATCH 140/241] svga: skip vertex attribute instruction with zero
 usage_mask

In emit_input_declarations(), we are skipping declarations for those
registers that are not being used. But in emit_vertex_attrib_instructions(),
we are still emitting instructions to tweak the vertex attributes even if
they are not being used. This causes an assert in the backend because an
input register is not declared in the shader. This patch fixes the problem
by skipping the instruction if the vertex attribute is not being used.
The changes in this patch originate from the code snippet from Jose as
suggested in bug 1530161.

Tested with piglit, Heaven, Turbine, glretrace.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/svga/svga_tgsi_vgpu10.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index c979f4a8a56..c5be11f936e 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -6170,6 +6170,11 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
 
       while (adjust_mask) {
          unsigned index = u_bit_scan(&adjust_mask);
+
+         /* skip the instruction if this vertex attribute is not being used */
+         if (emit->info.input_usage_mask[index] == 0)
+            continue;
+
          unsigned tmp = emit->vs.adjusted_input[index];
          struct tgsi_full_src_register input_src =
             make_src_reg(TGSI_FILE_INPUT, index);

From 9ccc716534af17937a572330ba6608819e31997c Mon Sep 17 00:00:00 2001
From: Sinclair Yeh <syeh@vmware.com>
Date: Wed, 9 Dec 2015 15:05:49 -0800
Subject: [PATCH 141/241] svga: allow preemptive flushing on DMA, update, and
 readback commands

The existing code effectively turns off preemptive flushing for all
but the regions used for draws.  This turns out to be overly
restrictive as some memory regions, e.g. GMR, may never get a draw
when used as a DMA upload staging area, causing problems for apps
that upload a large amount of textures, e.g. Unigine Heaven.

This patch fixes the Unigine Heaven memory allocation error and
has been verified to not cause a regression in the previous extended
retina display issue.

Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/svga/svga_cmd.c                    | 7 +++++++
 src/gallium/drivers/svga/svga_resource_buffer_upload.c | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index f35b1371ebe..00250213320 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -473,6 +473,7 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
 
    return PIPE_OK;
 }
@@ -543,6 +544,7 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
 
    return PIPE_OK;
 }
@@ -1720,6 +1722,7 @@ SVGA3D_UpdateGBImage(struct svga_winsys_context *swc,
    cmd->box = *box;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
 
    return PIPE_OK;
 }
@@ -1746,6 +1749,7 @@ SVGA3D_UpdateGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
 
    return PIPE_OK;
 }
@@ -1775,6 +1779,7 @@ SVGA3D_ReadbackGBImage(struct svga_winsys_context *swc,
    cmd->image.mipmap = mipLevel;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
 
    return PIPE_OK;
 }
@@ -1801,6 +1806,7 @@ SVGA3D_ReadbackGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_READ | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
 
    return PIPE_OK;
 }
@@ -1829,6 +1835,7 @@ SVGA3D_ReadbackGBImagePartial(struct svga_winsys_context *swc,
    cmd->invertBox = invertBox;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
 
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index ba1a1f222b6..a26a88da8e8 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -308,6 +308,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    pipe_resource_reference(&dummy, &sbuf->b.b);
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
@@ -381,6 +382,7 @@ svga_buffer_upload_command(struct svga_context *svga,
 
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;

From 0819287f562fec991269e03c03e4a622e248930e Mon Sep 17 00:00:00 2001
From: Sinclair Yeh <syeh@vmware.com>
Date: Thu, 10 Dec 2015 14:26:29 -0800
Subject: [PATCH 142/241] svga: Rename SVGA_HINT_FLAG_DRAW_EMITTED

Rename SVGA_HINT_FLAG_DRAW_EMITTED to SVGA_HINT_FLAG_CAN_PRE_FLUSH
because preemptive flush can be unblocked by more commands than
draw.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/svga/svga_cmd.c              | 16 ++++++++--------
 src/gallium/drivers/svga/svga_cmd_vgpu10.c       | 10 +++++-----
 src/gallium/drivers/svga/svga_draw.c             |  2 +-
 .../drivers/svga/svga_resource_buffer_upload.c   |  4 ++--
 src/gallium/drivers/svga/svga_winsys.h           |  2 +-
 src/gallium/winsys/svga/drm/vmw_context.c        |  8 ++++----
 6 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 00250213320..e45b3e72aeb 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -473,7 +473,7 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -544,7 +544,7 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1018,7 +1018,7 @@ SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
    *decls = declArray;
    *ranges = rangeArray;
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1722,7 +1722,7 @@ SVGA3D_UpdateGBImage(struct svga_winsys_context *swc,
    cmd->box = *box;
 
    swc->commit(swc);
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1749,7 +1749,7 @@ SVGA3D_UpdateGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1779,7 +1779,7 @@ SVGA3D_ReadbackGBImage(struct svga_winsys_context *swc,
    cmd->image.mipmap = mipLevel;
 
    swc->commit(swc);
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1806,7 +1806,7 @@ SVGA3D_ReadbackGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_READ | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1835,7 +1835,7 @@ SVGA3D_ReadbackGBImagePartial(struct svga_winsys_context *swc,
    cmd->invertBox = invertBox;
 
    swc->commit(swc);
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_cmd_vgpu10.c b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
index 5c121089f91..4cd9d5b9d1e 100644
--- a/src/gallium/drivers/svga/svga_cmd_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
@@ -535,7 +535,7 @@ SVGA3D_vgpu10_Draw(struct svga_winsys_context *swc,
 
    SVGA3D_COPY_BASIC_2(vertexCount, startVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -551,7 +551,7 @@ SVGA3D_vgpu10_DrawIndexed(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_3(indexCount, startIndexLocation,
                        baseVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -568,7 +568,7 @@ SVGA3D_vgpu10_DrawInstanced(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_4(vertexCountPerInstance, instanceCount,
                        startVertexLocation, startInstanceLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -588,7 +588,7 @@ SVGA3D_vgpu10_DrawIndexedInstanced(struct svga_winsys_context *swc,
                        startInstanceLocation);
 
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -598,7 +598,7 @@ SVGA3D_vgpu10_DrawAuto(struct svga_winsys_context *swc)
 {
    SVGA3D_CREATE_COMMAND(DrawAuto, DRAW_AUTO);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index d4c9914afbd..80526ed4d15 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -520,7 +520,7 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
          /* If we haven't yet emitted a drawing command or if any
           * vertex buffer state is changing, issue that state now.
           */
-         if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) == 0) ||
+         if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) == 0) ||
              vbuf_count != svga->state.hw_draw.num_vbuffers ||
              memcmp(buffers, svga->state.hw_draw.vbuffers,
                     vbuf_count * sizeof(buffers[0])) ||
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index a26a88da8e8..7f7ceab0aa5 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -308,7 +308,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    pipe_resource_reference(&dummy, &sbuf->b.b);
    SVGA_FIFOCommitAll(swc);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
@@ -382,7 +382,7 @@ svga_buffer_upload_command(struct svga_context *svga,
 
    SVGA_FIFOCommitAll(swc);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 3129e46ed06..562c6690fc1 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -85,7 +85,7 @@ struct winsys_handle;
 #define SVGA_QUERY_FLAG_SET        (1 << 0)
 #define SVGA_QUERY_FLAG_REF        (1 << 1)
 
-#define SVGA_HINT_FLAG_DRAW_EMITTED (1 << 0)
+#define SVGA_HINT_FLAG_CAN_PRE_FLUSH (1 << 0)  /* Can preemptively flush */
 
 /** Opaque surface handle */
 struct svga_winsys_surface;
diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c
index 4dc32366d61..dae121e4053 100644
--- a/src/gallium/winsys/svga/drm/vmw_context.c
+++ b/src/gallium/winsys/svga/drm/vmw_context.c
@@ -251,7 +251,7 @@ vmw_swc_flush(struct svga_winsys_context *swc,
    vswc->must_flush = FALSE;
    debug_flush_flush(vswc->fctx);
 #endif
-   swc->hints &= ~SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints &= ~SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    vswc->preemptive_flush = FALSE;
    vswc->seen_surfaces = 0;
    vswc->seen_regions = 0;
@@ -373,7 +373,7 @@ vmw_swc_region_relocation(struct svga_winsys_context *swc,
 
    if (vmw_swc_add_validate_buffer(vswc, reloc->buffer, flags)) {
       vswc->seen_regions += reloc->buffer->size;
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_regions >= VMW_GMR_POOL_SIZE/5)
          vswc->preemptive_flush = TRUE;
    }
@@ -416,7 +416,7 @@ vmw_swc_mob_relocation(struct svga_winsys_context *swc,
    if (vmw_swc_add_validate_buffer(vswc, pb_buffer, flags)) {
       vswc->seen_mobs += pb_buffer->size;
 
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_mobs >=
             vswc->vws->ioctl.max_mob_memory / VMW_MAX_MOB_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;
@@ -479,7 +479,7 @@ vmw_swc_surface_only_relocation(struct svga_winsys_context *swc,
       ++vswc->surface.staged;
 
       vswc->seen_surfaces += vsurf->size;
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_surfaces >=
             vswc->vws->ioctl.max_surface_memory / VMW_MAX_SURF_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;

From 39c41be50d9474dde4c0dcf23a546d14b212e80a Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Mon, 21 Sep 2015 14:22:53 -0700
Subject: [PATCH 143/241] mesa: Add KBL PCI IDs and platform information.

Add PCI IDs for the Intel Kabylake platforms.  The IDs are taken
directly from the Linux kernel patches, which are under review:

http://lists.freedesktop.org/archives/intel-gfx/2015-October/078967.html
http://cgit.freedesktop.org/~vivijim/drm-intel/log/?h=kbl-upstream-v2

The Kabylake PCI IDs taken from the kernel are rearranged to be in order
of GT type, then PCI ID.

Please note that if this patch is backported, the following fixes will
need to be added before this patch:

commit 28ed1e08e8ba98e "i965/skl: Remove early platform support"
commit c1e38ad37042b0e "i965/skl: Use larger URB size where available."

Thanks to Ben for fixing a bug around setting urb.size, and being
patient with my questions about what the various fields mean.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Suggested-by: Ben Widawsky <benjamin.widawsky@intel.com>
Tested-by: Rodrigo Vivi <rodrigo.vivi@intel.com> (KBL-GT2)
Cc: "11.1" <mesa-stable@lists.freedesktop.org>
---
 include/pci_ids/i965_pci_ids.h              | 22 ++++++++
 src/mesa/drivers/dri/i965/brw_device_info.c | 60 +++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index 5891ba67ea4..5139e279bcc 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -132,6 +132,28 @@ CHIPSET(0x1932, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193A, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193B, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193D, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x5902, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5906, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590A, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590B, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590E, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5913, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5915, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5917, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5912, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5916, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591A, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591B, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591D, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591E, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5921, kbl_gt2, "Intel(R) Kabylake GT2F")
+CHIPSET(0x5926, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592A, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592B, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x5932, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593A, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593D, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 4bfc83186bb..42bcb98f80e 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -402,6 +402,66 @@ static const struct brw_device_info brw_device_info_bxt = {
    }
 };
 
+/*
+ * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
+ * There's no KBL entry. Using the default SKL (GEN9) GS entries value.
+ */
+
+/*
+ * Both SKL and KBL support a maximum of 64 threads per
+ * Pixel Shader Dispatch (PSD) unit.
+ */
+#define  KBL_MAX_THREADS_PER_PSD 64
+
+static const struct brw_device_info brw_device_info_kbl_gt1 = {
+   GEN9_FEATURES,
+   .gt = 1,
+
+   .max_cs_threads = 7 * 6,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
+   .urb.size = 192,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt1_5 = {
+   GEN9_FEATURES,
+   .gt = 1,
+
+   .max_cs_threads = 7 * 6,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt2 = {
+   GEN9_FEATURES,
+   .gt = 2,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt3 = {
+   GEN9_FEATURES,
+   .gt = 3,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt4 = {
+   GEN9_FEATURES,
+   .gt = 4,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
+   /*
+    * From the "L3 Allocation and Programming" documentation:
+    *
+    * "URB is limited to 1008KB due to programming restrictions.  This
+    *  is not a restriction of the L3 implementation, but of the FF and
+    *  other clients.  Therefore, in a GT4 implementation it is
+    *  possible for the programmed allocation of the L3 data array to
+    *  provide 3*384KB=1152KB for URB, but only 1008KB of this
+    *  will be used."
+    */
+   .urb.size = 1008 / 3,
+};
+
 const struct brw_device_info *
 brw_get_device_info(int devid)
 {

From afa035031ff9e0c07a2297d864e46c76f7bfff58 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 7 Jan 2016 01:52:39 +0100
Subject: [PATCH 144/241] draw: rework handling of non-existing outputs in emit
 code

Previously the code would just redirect requests for attributes which
don't exist to use output 0. Rework this to output all zeros instead which
seems more useful - in particular some extensions like
ARB_fragment_layer_viewport require 0 in the fs even if it wasn't output by
previous stages. That way, drivers don't have to special-case this depending
on whether the vs/gs outputs some attribute or not.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/auxiliary/draw/draw_pipe_vbuf.c | 52 ++++++++++++---------
 src/gallium/auxiliary/draw/draw_pt_emit.c   | 12 +++++
 src/gallium/auxiliary/draw/draw_vertex.h    |  5 +-
 3 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index f36706cee01..6df7149b531 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -74,9 +74,10 @@ struct vbuf_stage {
    unsigned max_indices;
    unsigned nr_indices;
 
-   /* Cache point size somewhere it's address won't change:
+   /* Cache point size somewhere its address won't change:
     */
    float point_size;
+   float zero4[4];
 
    struct translate_cache *cache;
 };
@@ -205,6 +206,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
    struct translate_key hw_key;
    unsigned dst_offset;
    unsigned i;
+   const struct vertex_info *vinfo;
 
    vbuf->render->set_primitive(vbuf->render, prim);
 
@@ -215,27 +217,33 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
     * state change.
     */
    vbuf->vinfo = vbuf->render->get_vertex_info(vbuf->render);
-   vbuf->vertex_size = vbuf->vinfo->size * sizeof(float);
+   vinfo = vbuf->vinfo;
+   vbuf->vertex_size = vinfo->size * sizeof(float);
 
    /* Translate from pipeline vertices to hw vertices.
     */
    dst_offset = 0;
 
-   for (i = 0; i < vbuf->vinfo->num_attribs; i++) {
+   for (i = 0; i < vinfo->num_attribs; i++) {
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       enum pipe_format output_format;
-      unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
-      output_format = draw_translate_vinfo_format(vbuf->vinfo->attrib[i].emit);
-      emit_sz = draw_translate_vinfo_size(vbuf->vinfo->attrib[i].emit);
+      output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
+      emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
 
       /* doesn't handle EMIT_OMIT */
       assert(emit_sz != 0);
 
-      if (vbuf->vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
-	 src_buffer = 1;
-	 src_offset = 0;
+      if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
+         src_buffer = 1;
+         src_offset = 0;
+      }
+      else if (vinfo->attrib[i].src_index == DRAW_ATTR_NONEXIST) {
+         /* elements which don't exist will get assigned zeros */
+         src_buffer = 2;
+         src_offset = 0;
       }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
@@ -249,7 +257,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
       dst_offset += emit_sz;
    }
 
-   hw_key.nr_elements = vbuf->vinfo->num_attribs;
+   hw_key.nr_elements = vinfo->num_attribs;
    hw_key.output_stride = vbuf->vertex_size;
 
    /* Don't bother with caching at this stage:
@@ -261,6 +269,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
       vbuf->translate = translate_cache_find(vbuf->cache, &hw_key);
 
       vbuf->translate->set_buffer(vbuf->translate, 1, &vbuf->point_size, 0, ~0);
+      vbuf->translate->set_buffer(vbuf->translate, 2, &vbuf->zero4[0], 0, ~0);
    }
 
    vbuf->point_size = vbuf->stage.draw->rasterizer->point_size;
@@ -428,7 +437,7 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
    if (!vbuf)
       goto fail;
-   
+
    vbuf->stage.draw = draw;
    vbuf->stage.name = "vbuf";
    vbuf->stage.point = vbuf_first_point;
@@ -437,29 +446,30 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->stage.flush = vbuf_flush;
    vbuf->stage.reset_stipple_counter = vbuf_reset_stipple_counter;
    vbuf->stage.destroy = vbuf_destroy;
-   
+
    vbuf->render = render;
    vbuf->max_indices = MIN2(render->max_indices, UNDEFINED_VERTEX_ID-1);
 
-   vbuf->indices = (ushort *) align_malloc( vbuf->max_indices * 
-					    sizeof(vbuf->indices[0]), 
-					    16 );
+   vbuf->indices = (ushort *) align_malloc(vbuf->max_indices *
+                    sizeof(vbuf->indices[0]),
+                    16);
    if (!vbuf->indices)
       goto fail;
 
    vbuf->cache = translate_cache_create();
-   if (!vbuf->cache) 
+   if (!vbuf->cache)
       goto fail;
-      
-   
+
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
-   
+
+   vbuf->zero4[0] = vbuf->zero4[1] = vbuf->zero4[2] = vbuf->zero4[3] = 0.0f;
+
    return &vbuf->stage;
 
- fail:
+fail:
    if (vbuf)
       vbuf_destroy(&vbuf->stage);
-   
+
    return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 0b9fab5721c..6fb630b5498 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -44,6 +44,9 @@ struct pt_emit {
    unsigned prim;
 
    const struct vertex_info *vinfo;
+
+   float zero4[4];
+
 };
 
 
@@ -92,6 +95,11 @@ draw_pt_emit_prepare(struct pt_emit *emit,
          src_buffer = 1;
          src_offset = 0;
       }
+      else if (vinfo->attrib[i].src_index == DRAW_ATTR_NONEXIST) {
+         /* elements which don't exist will get assigned zeros */
+         src_buffer = 2;
+         src_offset = 0;
+      }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
@@ -111,6 +119,8 @@ draw_pt_emit_prepare(struct pt_emit *emit,
        translate_key_compare(&emit->translate->key, &hw_key) != 0) {
       translate_key_sanitize(&hw_key);
       emit->translate = translate_cache_find(emit->cache, &hw_key);
+
+      emit->translate->set_buffer(emit->translate, 2, &emit->zero4[0], 0, ~0);
    }
 
    if (!vinfo->size)
@@ -287,6 +297,8 @@ draw_pt_emit_create(struct draw_context *draw)
       return NULL;
    }
 
+   emit->zero4[0] = emit->zero4[1] = emit->zero4[2] = emit->zero4[3] = 0.0f;
+
    return emit;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index ee11d2f9276..ae207e2b0b3 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -44,6 +44,7 @@
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 
+#define DRAW_ATTR_NONEXIST 255
 
 /**
  * Vertex attribute emit modes
@@ -130,9 +131,9 @@ draw_emit_vertex_attr(struct vertex_info *vinfo,
    const uint n = vinfo->num_attribs;
 
    /* If the src_index is negative, meaning it hasn't been found
-    * lets just redirect it to the first output slot */
+    * we'll assign it all zeros later - set to DRAW_ATTR_NONEXIST */
    if (src_index < 0) {
-      src_index = 0;
+      src_index = DRAW_ATTR_NONEXIST;
    }
 
    assert(n < Elements(vinfo->attrib));

From 01761a38e8f49b528facf9853c27bbc8891a4424 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Fri, 18 Dec 2015 21:44:06 +0100
Subject: [PATCH 145/241] llvmpipe: scratch some special handling of
 vp_index/layer

It was actually slightly buggy (missing initialization / setup not dependent
on new vs, albeit I didn't see issues), but the case of non-existing attributes
is now handled by the draw emit code, so we don't need that anymore.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/drivers/llvmpipe/lp_bld_interp.h  |  3 +--
 src/gallium/drivers/llvmpipe/lp_context.h     |  6 ------
 .../drivers/llvmpipe/lp_state_derived.c       | 21 +++++--------------
 src/gallium/drivers/llvmpipe/lp_state_setup.c | 15 +------------
 4 files changed, 7 insertions(+), 38 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 0a52642e395..9029d2a4180 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -63,8 +63,7 @@ enum lp_interp {
    LP_INTERP_LINEAR,
    LP_INTERP_PERSPECTIVE,
    LP_INTERP_POSITION,
-   LP_INTERP_FACING,
-   LP_INTERP_ZERO
+   LP_INTERP_FACING
 };
 
 struct lp_shader_input {
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 9dcc102e758..b1cb10250bc 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -125,12 +125,6 @@ struct llvmpipe_context {
    /** A fake frontface output for unfilled primitives */
    uint8_t face_slot;
 
-   /** Which output slot is used for the fake vp index info */
-   uint8_t fake_vpindex_slot;
-
-   /** Which output slot is used for the fake layer info */
-   uint8_t fake_layer_slot;
-
    /** Depth format and bias settings. */
    boolean floating_point_depth;
    double mrd;   /**< minimum resolvable depth value, for polygon offset */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index f5bcfb2b511..f1f51cf0381 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -100,27 +100,16 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       /*
        * For vp index and layer, if the fs requires them but the vs doesn't
-       * provide them, store the slot - we'll later replace the data directly
-       * with zero (as required by ARB_fragment_layer_viewport). This is
-       * because draw itself just redirects them to whatever was at output 0.
-       * We'll also store the real vpindex/layer slot for setup use.
+       * provide them, draw (vbuf) will give us the required 0 (slot -1).
+       * (This means in this case we'll also use those slots in setup, which
+       * isn't necessary but they'll contain the correct (0) value.)
        */
       } else if (lpfs->info.base.input_semantic_name[i] ==
                  TGSI_SEMANTIC_VIEWPORT_INDEX) {
-         if (vs_index >= 0) {
-            llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_vpindex_slot = vinfo->num_attribs;
-         }
+         llvmpipe->viewport_index_slot = vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
-         if (vs_index >= 0) {
-            llvmpipe->layer_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_layer_slot = vinfo->num_attribs;
-         }
+         llvmpipe->layer_slot = vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       } else {
          /*
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index d7ba5c8ad8e..20e177f54fb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -602,13 +602,6 @@ emit_tri_coef( struct gallivm_state *gallivm,
           */
          break;
 
-      case LP_INTERP_ZERO:
-         /*
-          * The information we get from the output is bogus, replace it
-          * with zero.
-          */
-         emit_constant_coef4(gallivm, args, slot+1, args->bld.zero);
-         break;
       case LP_INTERP_FACING:
          emit_facing_coef(gallivm, args, slot+1);
          break;
@@ -879,13 +872,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
    key->pad = 0;
    memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]);
    for (i = 0; i < key->num_inputs; i++) {
-      if (key->inputs[i].interp == LP_INTERP_CONSTANT) {
-         if (key->inputs[i].src_index == lp->fake_vpindex_slot ||
-             key->inputs[i].src_index == lp->fake_layer_slot) {
-            key->inputs[i].interp = LP_INTERP_ZERO;
-         }
-      }
-      else if (key->inputs[i].interp == LP_INTERP_COLOR) {
+      if (key->inputs[i].interp == LP_INTERP_COLOR) {
          if (lp->rasterizer->flatshade)
             key->inputs[i].interp = LP_INTERP_CONSTANT;
          else

From b64d008052a0111e3170169c4bed08693d94b220 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 19 Dec 2015 02:33:25 +0100
Subject: [PATCH 146/241] softpipe: fix mapping of "special" vs outputs

Unlike llvmpipe, softpipe always tells draw to emit the vertices as-is.
The two vertex layouts it calculates are a bit confusing: one is just used to
tell draw to emit vertices as-is, and the other has draw written all over it
but is something draw is completely unaware of; it is used only to look up the
correct interpolation info later in setup.
Thus, the slots used are different to what llvmpipe does (I'm going to clean
up the confusing two layout stuff).

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 .../drivers/softpipe/sp_state_derived.c       | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 7e998af1325..56ecc3b6140 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -79,10 +79,14 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
        */
       vinfo_vbuf->num_attribs = 0;
       for (i = 0; i < num; i++) {
-	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
+         draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
       draw_compute_vertex_size(vinfo_vbuf);
 
+      softpipe->viewport_index_slot = 0;
+      softpipe->layer_slot = 0;
+      softpipe->psize_slot = 0;
+
       /*
        * Loop over fragment shader inputs, searching for the matching output
        * from the vertex shader.
@@ -128,10 +132,15 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
          src = draw_find_shader_output(softpipe->draw,
                                        fsInfo->input_semantic_name[i],
                                        fsInfo->input_semantic_index[i]);
-	 if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && src == -1)
-	   /* try and find a bcolor */
-	   src = draw_find_shader_output(softpipe->draw,
-					 TGSI_SEMANTIC_BCOLOR, fsInfo->input_semantic_index[i]);
+         if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && src == -1)
+            /*
+             * try and find a bcolor.
+             * Note that if there's both front and back color, draw will
+             * have copied back to front color already.
+             */
+            src = draw_find_shader_output(softpipe->draw,
+                                          TGSI_SEMANTIC_BCOLOR,
+                                          fsInfo->input_semantic_index[i]);
 
          draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
       }
@@ -141,7 +150,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
                                          TGSI_SEMANTIC_PSIZE, 0);
 
       if (vs_index >= 0) {
-         softpipe->psize_slot = vinfo->num_attribs;
+         softpipe->psize_slot = vs_index;
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       }
 
@@ -150,10 +159,8 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
                                          TGSI_SEMANTIC_VIEWPORT_INDEX,
                                          0);
       if (vs_index >= 0) {
-         softpipe->viewport_index_slot = vinfo->num_attribs;
+         softpipe->viewport_index_slot = vs_index;
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->viewport_index_slot = 0;
       }
 
       /* Figure out if we need layer */
@@ -161,10 +168,8 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
                                          TGSI_SEMANTIC_LAYER,
                                          0);
       if (vs_index >= 0) {
-         softpipe->layer_slot = vinfo->num_attribs;
+         softpipe->layer_slot = vs_index;
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->layer_slot = 0;
       }
 
       draw_compute_vertex_size(vinfo);

From 892e2d1395375c6f904af5250371c8d2784c8762 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 19 Dec 2015 03:37:17 +0100
Subject: [PATCH 147/241] softpipe: don't abuse the draw vertex_info struct for
 something different

softpipe would calculate two "vertex layouts". The second one was however
just used for internal purposes, draw would know nothing about it even though
it looked exactly the same as the other one we tell draw about.
So, store that information separately as this was just confusing.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/drivers/softpipe/sp_context.h     |  3 +-
 src/gallium/drivers/softpipe/sp_setup.c       | 27 ++++++----
 src/gallium/drivers/softpipe/sp_setup.h       | 16 ++++--
 src/gallium/drivers/softpipe/sp_state.h       |  3 --
 .../drivers/softpipe/sp_state_derived.c       | 49 +++++++++----------
 5 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 8e5e24217a5..188cdeaf76f 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -37,6 +37,7 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
+#include "sp_setup.h"
 
 
 /** Do polygon stipple in the draw module? */
@@ -117,7 +118,7 @@ struct softpipe_context {
    unsigned const_buffer_size[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
 
    /** Vertex format */
-   struct vertex_info vertex_info;
+   struct sp_setup_info setup_info;
    struct vertex_info vertex_info_vbuf;
 
    /** Which vertex shader output slot contains point size */
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index ac2d97825ce..28f163b4d8f 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -599,10 +599,12 @@ setup_tri_coefficients(struct setup_context *setup)
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float v[3];
 
+   assert(sinfo->valid);
+
    /* z and w are done by linear interpolation:
     */
    v[0] = setup->vmin[0][2];
@@ -618,13 +620,14 @@ setup_tri_coefficients(struct setup_context *setup)
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
+      switch (sinfo->attrib[fragSlot].interp) {
       case INTERP_CONSTANT:
-         for (j = 0; j < TGSI_NUM_CHANNELS; j++)
+         for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         }
          break;
       case INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
@@ -966,11 +969,13 @@ setup_line_coefficients(struct setup_context *setup,
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float area;
    float v[2];
 
+   assert(sinfo->valid);
+
    /* use setup->vmin, vmax to point to vertices */
    if (softpipe->rasterizer->flatshade_first)
       setup->vprovoke = v0;
@@ -1001,10 +1006,10 @@ setup_line_coefficients(struct setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
+      switch (sinfo->attrib[fragSlot].interp) {
       case INTERP_CONSTANT:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -1236,7 +1241,7 @@ sp_setup_point(struct setup_context *setup,
    const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth;
    const float x = v0[0][0];  /* Note: data[0] is always position */
    const float y = v0[0][1];
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    uint layer = 0;
    unsigned viewport_index = 0;
@@ -1245,6 +1250,8 @@ sp_setup_point(struct setup_context *setup,
    print_vertex(setup, v0);
 #endif
 
+   assert(sinfo->valid);
+
    if (setup->softpipe->no_rast || setup->softpipe->rasterizer->rasterizer_discard)
       return;
 
@@ -1285,10 +1292,10 @@ sp_setup_point(struct setup_context *setup,
    const_coeff(setup, &setup->posCoef, 0, 3);
 
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
+      switch (sinfo->attrib[fragSlot].interp) {
       case INTERP_CONSTANT:
          /* fall-through */
       case INTERP_LINEAR:
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
index 191494acbb8..8bb50b98fec 100644
--- a/src/gallium/drivers/softpipe/sp_setup.h
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -30,11 +30,19 @@
 struct setup_context;
 struct softpipe_context;
 
+struct sp_setup_info {
+   unsigned valid;
+   struct {
+      unsigned interp:8;      /**< INTERP_X */
+      unsigned src_index:8;
+   } attrib[PIPE_MAX_SHADER_OUTPUTS];
+};
+
 void 
-sp_setup_tri( struct setup_context *setup,
-	   const float (*v0)[4],
-	   const float (*v1)[4],
-	   const float (*v2)[4] );
+sp_setup_tri(struct setup_context *setup,
+             const float (*v0)[4],
+             const float (*v1)[4],
+             const float (*v2)[4]);
 
 void
 sp_setup_line(struct setup_context *setup,
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index c35534c931d..16a2897f526 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -174,9 +174,6 @@ void
 softpipe_unmap_texture_surfaces(struct softpipe_context *sp);
 
 
-struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe);
-
 struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe);
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 56ecc3b6140..3fb1daee2c1 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -48,7 +48,7 @@
 static void
 invalidate_vertex_layout(struct softpipe_context *softpipe)
 {
-   softpipe->vertex_info.num_attribs =  0;
+   softpipe->setup_info.valid =  0;
 }
 
 
@@ -57,17 +57,16 @@ invalidate_vertex_layout(struct softpipe_context *softpipe)
  * (simple float[][4]) used by the 'draw' module into vertices for
  * rasterization.
  *
- * This function validates the vertex layout and returns a pointer to a
- * vertex_info object.
+ * This function validates the vertex layout.
  */
-struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe)
+static void
+softpipe_compute_vertex_info(struct softpipe_context *softpipe)
 {
-   struct vertex_info *vinfo = &softpipe->vertex_info;
+   struct sp_setup_info *sinfo = &softpipe->setup_info;
    int vs_index;
 
-   if (vinfo->num_attribs == 0) {
-      /* compute vertex layout now */
+   if (sinfo->valid == 0) {
+      /* compute vertex layout for vbuf now */
       const struct tgsi_shader_info *fsInfo = &softpipe->fs_variant->info;
       struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
       const uint num = draw_num_shader_outputs(softpipe->draw);
@@ -91,7 +90,6 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
        * Loop over fragment shader inputs, searching for the matching output
        * from the vertex shader.
        */
-      vinfo->num_attribs = 0;
       for (i = 0; i < fsInfo->num_inputs; i++) {
          int src;
          enum interp_mode interp = INTERP_LINEAR;
@@ -142,7 +140,15 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
                                           TGSI_SEMANTIC_BCOLOR,
                                           fsInfo->input_semantic_index[i]);
 
-         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
+         sinfo->attrib[i].interp = interp;
+         /*
+          * note src can be -1 if not found. Would need special handling,
+          * (as we don't tell draw anything about it) just force to 0.
+          * It's wrong either way but should be safer...
+          */
+         if (src < 0)
+            src = 0;
+         sinfo->attrib[i].src_index = src;
       }
 
       /* Figure out if we need pointsize as well. */
@@ -151,7 +157,6 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
 
       if (vs_index >= 0) {
          softpipe->psize_slot = vs_index;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       }
 
       /* Figure out if we need viewport index */
@@ -160,7 +165,6 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
                                          0);
       if (vs_index >= 0) {
          softpipe->viewport_index_slot = vs_index;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       }
 
       /* Figure out if we need layer */
@@ -169,34 +173,25 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
                                          0);
       if (vs_index >= 0) {
          softpipe->layer_slot = vs_index;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       }
-
-      draw_compute_vertex_size(vinfo);
+      softpipe->setup_info.valid = 1;
    }
 
-   return vinfo;
+   return;
 }
 
 
 /**
  * Called from vbuf module.
  *
- * Note that there's actually two different vertex layouts in softpipe.
- *
- * The normal one is computed in softpipe_get_vertex_info() above and is
- * used by the point/line/tri "setup" code.
- *
- * The other one (this one) is only used by the vbuf module (which is
- * not normally used by default but used in testing).  For the vbuf module,
- * we basically want to pass-through the draw module's vertex layout as-is.
- * When the softpipe vbuf code begins drawing, the normal vertex layout
- * will come into play again.
+ * Note the vertex layout used for vbuf is simply telling it to pass
+ * through everything as is. The mapping actually used for setup is
+ * stored separately (but calculated here too at the same time).
  */
 struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 {
-   (void) softpipe_get_vertex_info(softpipe);
+   softpipe_compute_vertex_info(softpipe);
    return &softpipe->vertex_info_vbuf;
 }
 

From 2dbc20e45689e09766552517a74e2270e49817b5 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 19 Dec 2015 03:43:14 +0100
Subject: [PATCH 148/241] draw: nuke the interp parameter from vertex_info

draw emit couldn't care less what the interpolation mode is...
This somehow looked like it would matter, and all drivers more or less
dutifully filled it in correctly. But this is only used for emit; if draw
needs to know about the interpolation mode (for clipping, for instance)
it will get that information from the vs anyway.
softpipe actually used to depend on that interpolation parameter, as it
abused that structure quite a bit, but it no longer does.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/auxiliary/draw/draw_vertex.h      | 17 +------------
 src/gallium/drivers/i915/i915_context.h       |  1 -
 src/gallium/drivers/i915/i915_state.c         |  1 -
 src/gallium/drivers/i915/i915_state_derived.c | 17 ++++++-------
 .../drivers/llvmpipe/lp_state_derived.c       | 25 +++++++++----------
 src/gallium/drivers/nouveau/nv30/nv30_draw.c  | 15 ++++++-----
 src/gallium/drivers/r300/r300_state_derived.c | 24 ++++++------------
 src/gallium/drivers/softpipe/sp_setup.c       | 25 +++++++++----------
 src/gallium/drivers/softpipe/sp_setup.h       | 13 +++++++++-
 .../drivers/softpipe/sp_state_derived.c       | 16 ++++++------
 src/gallium/drivers/svga/svga_swtnl_state.c   | 10 +++-----
 11 files changed, 72 insertions(+), 92 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index ae207e2b0b3..c7b1afe5dde 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -61,18 +61,6 @@ enum attrib_emit {
 };
 
 
-/**
- * Attribute interpolation mode
- */
-enum interp_mode {
-   INTERP_NONE,      /**< never interpolate vertex header info */
-   INTERP_POS,       /**< special case for frag position */
-   INTERP_CONSTANT,
-   INTERP_LINEAR,
-   INTERP_PERSPECTIVE
-};
-
-
 /**
  * Information about hardware/rasterization vertex layout.
  */
@@ -86,8 +74,7 @@ struct vertex_info
     * memcmp() comparisons.
     */
    struct {
-      unsigned interp_mode:4;      /**< INTERP_x */
-      unsigned emit:4;             /**< EMIT_x */
+      unsigned emit:8;             /**< EMIT_x */
       unsigned src_index:8;          /**< map to post-xform attribs */
    } attrib[PIPE_MAX_SHADER_OUTPUTS];
 };
@@ -125,7 +112,6 @@ draw_vinfo_copy( struct vertex_info *dst,
 static inline uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
                       enum attrib_emit emit, 
-                      enum interp_mode interp, /* only used by softpipe??? */
                       int src_index)
 {
    const uint n = vinfo->num_attribs;
@@ -138,7 +124,6 @@ draw_emit_vertex_attr(struct vertex_info *vinfo,
 
    assert(n < Elements(vinfo->attrib));
    vinfo->attrib[n].emit = emit;
-   vinfo->attrib[n].interp_mode = interp;
    vinfo->attrib[n].src_index = src_index;
    vinfo->num_attribs++;
    return n;
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 1ed685188db..2adaee30fb9 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -195,7 +195,6 @@ struct i915_rasterizer_state {
 
    unsigned light_twoside : 1;
    unsigned st;
-   enum interp_mode color_interp;
 
    unsigned LIS4;
    unsigned LIS7;
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index d1661fed3f7..b54a9fbf4f9 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -920,7 +920,6 @@ i915_create_rasterizer_state(struct pipe_context *pipe,
    struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state );
 
    cso->templ = *rasterizer;
-   cso->color_interp = rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    cso->light_twoside = rasterizer->light_twoside;
    cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE;
    cso->ds[1].f = rasterizer->offset_scale;
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index 7ad88a1ce01..bd0f448f645 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -57,7 +57,6 @@ static uint find_mapping(const struct i915_fragment_shader* fs, int unit)
 static void calculate_vertex_layout(struct i915_context *i915)
 {
    const struct i915_fragment_shader *fs = i915->fs;
-   const enum interp_mode colorInterp = i915->rasterizer->color_interp;
    struct vertex_info vinfo;
    boolean texCoords[I915_TEX_UNITS], colors[2], fog, needW, face;
    uint i;
@@ -107,12 +106,12 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* pos */
    src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
    if (needW) {
-      draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZW;
       vinfo.attrib[0].emit = EMIT_4F;
    }
    else {
-      draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_3F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZ;
       vinfo.attrib[0].emit = EMIT_3F;
    }
@@ -123,21 +122,21 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* primary color */
    if (colors[0]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_COLOR;
    }
 
    /* secondary color */
    if (colors[1]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG;
    }
 
    /* fog coord, not fog blend factor */
    if (fog) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM;
    }
 
@@ -147,7 +146,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
       if (texCoords[i]) {
          hwtc = TEXCOORDFMT_4D;
          src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, fs->generic_mapping[i]);
-         draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       }
       else {
          hwtc = TEXCOORDFMT_NOT_PRESENT;
@@ -164,7 +163,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
        * module by adding an extra shader output.
        */
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FACE, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_CONSTANT, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[1] &= ~(TEXCOORDFMT_NOT_PRESENT << (slot * 4));
       vinfo.hwfmt[1] |= TEXCOORDFMT_1D << (slot * 4);
    }
@@ -185,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
 struct i915_tracked_state i915_update_vertex_layout = {
    "vertex_layout",
    calculate_vertex_layout,
-   I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS
+   I915_NEW_FS | I915_NEW_VS
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index f1f51cf0381..fbc2e185343 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -76,7 +76,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                       TGSI_SEMANTIC_POSITION,
                                       0);
 
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
 
    for (i = 0; i < lpfs->info.base.num_inputs; i++) {
       /*
@@ -95,9 +95,9 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
       if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
          llvmpipe->face_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) {
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       /*
        * For vp index and layer, if the fs requires them but the vs doesn't
        * provide them, draw (vbuf) will give us the required 0 (slot -1).
@@ -107,15 +107,15 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       } else if (lpfs->info.base.input_semantic_name[i] ==
                  TGSI_SEMANTIC_VIEWPORT_INDEX) {
          llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
          llvmpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       } else {
          /*
           * Emit the requested fs attribute for all but position.
           */
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -127,7 +127,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
       if (vs_index >= 0) {
          llvmpipe->bcolor_slot[i] = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -138,7 +138,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
    if (vs_index >= 0) {
       llvmpipe->psize_slot = vinfo->num_attribs;
-      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
    }
 
    /* Figure out if we need viewport index (if it wasn't already in fs input) */
@@ -148,7 +148,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                          0);
       if (vs_index >= 0) {
          llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -159,7 +159,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                          0);
       if (vs_index >= 0) {
          llvmpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -186,10 +186,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
    }
       
-   if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
-                          LP_NEW_FS |
+   if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_VS))
-      compute_vertex_info( llvmpipe );
+      compute_vertex_info(llvmpipe);
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_FRAMEBUFFER |
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index 098d6e499fa..7b0d0745766 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -208,17 +208,16 @@ nv30_render_release_vertices(struct vbuf_render *render)
 
 static const struct {
    unsigned emit;
-   unsigned interp;
    unsigned vp30;
    unsigned vp40;
    unsigned ow40;
 } vroute [] = {
-   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, INTERP_PERSPECTIVE, 0, 0, 0x00000000 },
-   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, INTERP_LINEAR     , 3, 1, 0x00000001 },
-   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, INTERP_LINEAR     , 1, 3, 0x00000004 },
-   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 },
-   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, INTERP_POS  , 6, 6, 0x00000020 },
-   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
+   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, 0, 0, 0x00000000 },
+   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, 3, 1, 0x00000001 },
+   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, 1, 3, 0x00000004 },
+   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, 5, 5, 0x00000010 },
+   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, 6, 6, 0x00000020 },
+   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, 8, 7, 0x00004000 },
 };
 
 static bool
@@ -247,7 +246,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
    if (emit == EMIT_OMIT)
       return false;
 
-   draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
+   draw_emit_vertex_attr(vinfo, emit, attrib);
    format = draw_translate_vinfo_format(emit);
 
    r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw;
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index da472f4d7f4..741e263e7ed 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -52,7 +52,6 @@ enum r300_rs_col_write_type {
 
 static void r300_draw_emit_attrib(struct r300_context* r300,
                                   enum attrib_emit emit,
-                                  enum interp_mode interp,
                                   int index)
 {
     struct r300_vertex_shader* vs = r300->vs_state.state;
@@ -62,7 +61,7 @@ static void r300_draw_emit_attrib(struct r300_context* r300,
     output = draw_find_shader_output(r300->draw,
                                      info->output_semantic_name[index],
                                      info->output_semantic_index[index]);
-    draw_emit_vertex_attr(&r300->vertex_info, emit, interp, output);
+    draw_emit_vertex_attr(&r300->vertex_info, emit, output);
 }
 
 static void r300_draw_emit_all_attribs(struct r300_context* r300)
@@ -73,31 +72,27 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
 
     /* Position. */
     if (vs_outputs->pos != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->pos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->pos);
     } else {
         assert(0);
     }
 
     /* Point size. */
     if (vs_outputs->psize != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, INTERP_POS,
-                              vs_outputs->psize);
+        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, vs_outputs->psize);
     }
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->color[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->color[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->color[i]);
         }
     }
 
     /* Back-face colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->bcolor[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->bcolor[i]);
         }
     }
 
@@ -108,16 +103,14 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     for (i = 0; i < ATTR_GENERIC_COUNT && gen_count < 8; i++) {
         if (vs_outputs->generic[i] != ATTR_UNUSED &&
             !(r300->sprite_coord_enable & (1 << i))) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                                  vs_outputs->generic[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->generic[i]);
             gen_count++;
         }
     }
 
     /* Fog coordinates. */
     if (gen_count < 8 && vs_outputs->fog != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->fog);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->fog);
         gen_count++;
     }
 
@@ -125,8 +118,7 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     if (r300_fs(r300)->shader->inputs.wpos != ATTR_UNUSED && gen_count < 8) {
         DBG(r300, DBG_SWTCL, "draw_emit_attrib: WPOS, index: %i\n",
             vs_outputs->wpos);
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->wpos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->wpos);
     }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 28f163b4d8f..ffe49260b9a 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -38,7 +38,6 @@
 #include "sp_setup.h"
 #include "sp_state.h"
 #include "draw/draw_context.h"
-#include "draw/draw_vertex.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
@@ -624,12 +623,12 @@ setup_tri_coefficients(struct setup_context *setup)
       uint j;
 
       switch (sinfo->attrib[fragSlot].interp) {
-      case INTERP_CONSTANT:
+      case SP_INTERP_CONSTANT:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          }
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -639,7 +638,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -649,7 +648,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -1010,11 +1009,11 @@ setup_line_coefficients(struct setup_context *setup,
       uint j;
 
       switch (sinfo->attrib[fragSlot].interp) {
-      case INTERP_CONSTANT:
+      case SP_INTERP_CONSTANT:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1023,7 +1022,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1032,7 +1031,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -1296,18 +1295,18 @@ sp_setup_point(struct setup_context *setup,
       uint j;
 
       switch (sinfo->attrib[fragSlot].interp) {
-      case INTERP_CONSTANT:
+      case SP_INTERP_CONSTANT:
          /* fall-through */
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             point_persp_coeff(setup, setup->vprovoke,
                               &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
index 8bb50b98fec..9efae1cb5ed 100644
--- a/src/gallium/drivers/softpipe/sp_setup.h
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -30,10 +30,21 @@
 struct setup_context;
 struct softpipe_context;
 
+/**
+ * Attribute interpolation mode
+ */
+enum sp_interp_mode {
+   SP_INTERP_POS,       /**< special case for frag position */
+   SP_INTERP_CONSTANT,
+   SP_INTERP_LINEAR,
+   SP_INTERP_PERSPECTIVE
+};
+
+
 struct sp_setup_info {
    unsigned valid;
    struct {
-      unsigned interp:8;      /**< INTERP_X */
+      unsigned interp:8;      /**< SP_INTERP_X */
       unsigned src_index:8;
    } attrib[PIPE_MAX_SHADER_OUTPUTS];
 };
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 3fb1daee2c1..ca29d76f8c2 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -78,7 +78,7 @@ softpipe_compute_vertex_info(struct softpipe_context *softpipe)
        */
       vinfo_vbuf->num_attribs = 0;
       for (i = 0; i < num; i++) {
-         draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
+         draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, i);
       }
       draw_compute_vertex_size(vinfo_vbuf);
 
@@ -92,17 +92,17 @@ softpipe_compute_vertex_info(struct softpipe_context *softpipe)
        */
       for (i = 0; i < fsInfo->num_inputs; i++) {
          int src;
-         enum interp_mode interp = INTERP_LINEAR;
+         enum sp_interp_mode interp = SP_INTERP_LINEAR;
 
          switch (fsInfo->input_interpolate[i]) {
          case TGSI_INTERPOLATE_CONSTANT:
-            interp = INTERP_CONSTANT;
+            interp = SP_INTERP_CONSTANT;
             break;
          case TGSI_INTERPOLATE_LINEAR:
-            interp = INTERP_LINEAR;
+            interp = SP_INTERP_LINEAR;
             break;
          case TGSI_INTERPOLATE_PERSPECTIVE:
-            interp = INTERP_PERSPECTIVE;
+            interp = SP_INTERP_PERSPECTIVE;
             break;
          case TGSI_INTERPOLATE_COLOR:
             assert(fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR);
@@ -113,15 +113,15 @@ softpipe_compute_vertex_info(struct softpipe_context *softpipe)
 
          switch (fsInfo->input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
-            interp = INTERP_POS;
+            interp = SP_INTERP_POS;
             break;
 
          case TGSI_SEMANTIC_COLOR:
             if (fsInfo->input_interpolate[i] == TGSI_INTERPOLATE_COLOR) {
                if (softpipe->rasterizer->flatshade)
-                  interp = INTERP_CONSTANT;
+                  interp = SP_INTERP_CONSTANT;
                else
-                  interp = INTERP_PERSPECTIVE;
+                  interp = SP_INTERP_PERSPECTIVE;
             }
             break;
          }
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index 79dc0bf580c..4d21f4f0e60 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -220,8 +220,6 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
    struct draw_context *draw = svga->swtnl.draw;
    struct vertex_info *vinfo = &svga_render->vertex_info;
    SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
-   const enum interp_mode colorInterp =
-      svga->curr.rast->templ.flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    struct svga_fragment_shader *fs = svga->curr.fs;
    int offset = 0;
    int nr_decls = 0;
@@ -236,7 +234,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
    /* always add position */
    src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0);
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, src);
    vinfo->attrib[0].emit = EMIT_4F;
    vdecl[0].array.offset = offset;
    vdecl[0].identity.method = SVGA3D_DECLMETHOD_DEFAULT;
@@ -257,14 +255,14 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
       switch (sem_name) {
       case TGSI_SEMANTIC_COLOR:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_COLOR;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          offset += 16;
          nr_decls++;
          break;
       case TGSI_SEMANTIC_GENERIC:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          vdecl[nr_decls].identity.usageIndex =
@@ -273,7 +271,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
          nr_decls++;
          break;
       case TGSI_SEMANTIC_FOG:
-         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_1F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT1;
          assert(vdecl[nr_decls].identity.usageIndex == 0);

From 8e3a76791f208e67392b7b7a2e63eca32945ac7b Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 19 Dec 2015 06:12:27 +0100
Subject: [PATCH 149/241] llvmpipe: use ints not unsigned for slots

The slots can't actually be 0 (as position is always there), but using signed
ints with -1 meaning "not present" should avoid confusion.

This was supposed to have been done by af7ba989fb5a39925a0a1261ed281fe7f48a16cf
but I accidentally pushed an older version of the patch in the end...
Also prettify slightly. And make some notes about the confusing and useless
fs input "map".

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/drivers/llvmpipe/lp_context.h     | 12 +--
 .../drivers/llvmpipe/lp_setup_context.h       |  8 +-
 .../drivers/llvmpipe/lp_state_derived.c       | 73 ++++++++++---------
 src/gallium/drivers/llvmpipe/lp_state_fs.c    | 35 ++++-----
 src/gallium/drivers/llvmpipe/lp_state_setup.c |  4 +-
 src/gallium/drivers/llvmpipe/lp_state_setup.h |  8 +-
 6 files changed, 73 insertions(+), 67 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index b1cb10250bc..62d99bbaac8 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -108,22 +108,22 @@ struct llvmpipe_context {
    struct vertex_info vertex_info;
    
    /** Which vertex shader output slot contains color */
-   uint8_t color_slot[2];
+   int8_t color_slot[2];
 
    /** Which vertex shader output slot contains bcolor */
-   uint8_t bcolor_slot[2];
+   int8_t bcolor_slot[2];
 
    /** Which vertex shader output slot contains point size */
-   uint8_t psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   uint8_t viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which geometry shader output slot contains layer */
-   uint8_t layer_slot;
+   int8_t layer_slot;
 
    /** A fake frontface output for unfilled primitives */
-   uint8_t face_slot;
+   int8_t face_slot;
 
    /** Depth format and bias settings. */
    boolean floating_point_depth;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 4451284c303..80acd74bddd 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -105,10 +105,10 @@ struct lp_setup_context
    float pixel_offset;
    float line_width;
    float point_size;
-   uint8_t psize_slot;
-   uint8_t viewport_index_slot;
-   uint8_t layer_slot;
-   uint8_t face_slot;
+   int8_t psize_slot;
+   int8_t viewport_index_slot;
+   int8_t layer_slot;
+   int8_t face_slot;
 
    struct pipe_framebuffer_state fb;
    struct u_rect framebuffer;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index fbc2e185343..34961cbbac5 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -48,21 +48,26 @@
 static void
 compute_vertex_info(struct llvmpipe_context *llvmpipe)
 {
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+   const struct tgsi_shader_info *fsInfo = &llvmpipe->fs->info.base;
    struct vertex_info *vinfo = &llvmpipe->vertex_info;
    int vs_index;
    uint i;
 
    draw_prepare_shader_outputs(llvmpipe->draw);
 
-   llvmpipe->color_slot[0] = 0;
-   llvmpipe->color_slot[1] = 0;
-   llvmpipe->bcolor_slot[0] = 0;
-   llvmpipe->bcolor_slot[1] = 0;
-   llvmpipe->viewport_index_slot = 0;
-   llvmpipe->layer_slot = 0;
-   llvmpipe->face_slot = 0;
-   llvmpipe->psize_slot = 0;
+   /*
+    * Those can't actually be 0 (because pos is always at 0).
+    * But use ints anyway to avoid confusion (in vs outputs, they
+    * can very well be at pos 0).
+    */
+   llvmpipe->color_slot[0] = -1;
+   llvmpipe->color_slot[1] = -1;
+   llvmpipe->bcolor_slot[0] = -1;
+   llvmpipe->bcolor_slot[1] = -1;
+   llvmpipe->viewport_index_slot = -1;
+   llvmpipe->layer_slot = -1;
+   llvmpipe->face_slot = -1;
+   llvmpipe->psize_slot = -1;
 
    /*
     * Match FS inputs against VS outputs, emitting the necessary
@@ -73,30 +78,26 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
    vinfo->num_attribs = 0;
 
    vs_index = draw_find_shader_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_POSITION,
-                                      0);
+                                      TGSI_SEMANTIC_POSITION, 0);
 
    draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
 
-   for (i = 0; i < lpfs->info.base.num_inputs; i++) {
+   for (i = 0; i < fsInfo->num_inputs; i++) {
       /*
        * Search for each input in current vs output:
        */
-
       vs_index = draw_find_shader_output(llvmpipe->draw,
-                                         lpfs->info.base.input_semantic_name[i],
-                                         lpfs->info.base.input_semantic_index[i]);
+                                         fsInfo->input_semantic_name[i],
+                                         fsInfo->input_semantic_index[i]);
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
-          lpfs->info.base.input_semantic_index[i] < 2) {
-         int idx = lpfs->info.base.input_semantic_index[i];
-         llvmpipe->color_slot[idx] = vinfo->num_attribs;
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+          fsInfo->input_semantic_index[i] < 2) {
+         int idx = fsInfo->input_semantic_index[i];
+         llvmpipe->color_slot[idx] = (int)vinfo->num_attribs;
       }
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
-         llvmpipe->face_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) {
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
+         llvmpipe->face_slot = (int)vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       /*
        * For vp index and layer, if the fs requires them but the vs doesn't
@@ -104,16 +105,20 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
        * (This means in this case we'll also use those slots in setup, which
        * isn't necessary but they'll contain the correct (0) value.)
        */
-      } else if (lpfs->info.base.input_semantic_name[i] ==
+      } else if (fsInfo->input_semantic_name[i] ==
                  TGSI_SEMANTIC_VIEWPORT_INDEX) {
-         llvmpipe->viewport_index_slot = vinfo->num_attribs;
+         llvmpipe->viewport_index_slot = (int)vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
-         llvmpipe->layer_slot = vinfo->num_attribs;
+      } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       } else {
          /*
-          * Emit the requested fs attribute for all but position.
+          * Note that we'd actually want to skip position (as we won't use
+          * the attribute in the fs) but can't. The reason is that we don't
+          * actually have a input/output map for setup (even though it looks
+          * like we do...). Could adjust for this though even without a map
+          * (in llvmpipe_create_fs_state()).
           */
          draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
@@ -126,7 +131,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                          TGSI_SEMANTIC_BCOLOR, i);
 
       if (vs_index >= 0) {
-         llvmpipe->bcolor_slot[i] = vinfo->num_attribs;
+         llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
@@ -137,28 +142,28 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                       TGSI_SEMANTIC_PSIZE, 0);
 
    if (vs_index >= 0) {
-      llvmpipe->psize_slot = vinfo->num_attribs;
+      llvmpipe->psize_slot = (int)vinfo->num_attribs;
       draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
    }
 
    /* Figure out if we need viewport index (if it wasn't already in fs input) */
-   if (llvmpipe->viewport_index_slot == 0) {
+   if (llvmpipe->viewport_index_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_VIEWPORT_INDEX,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->viewport_index_slot = vinfo->num_attribs;
+         llvmpipe->viewport_index_slot =(int)vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
    /* Figure out if we need layer (if it wasn't already in fs input) */
-   if (llvmpipe->layer_slot == 0) {
+   if (llvmpipe->layer_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_LAYER,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->layer_slot = vinfo->num_attribs;
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 079083e9601..83ff97659fb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -2695,34 +2695,35 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
 
       switch (shader->info.base.input_interpolate[i]) {
       case TGSI_INTERPOLATE_CONSTANT:
-	 shader->inputs[i].interp = LP_INTERP_CONSTANT;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_CONSTANT;
+         break;
       case TGSI_INTERPOLATE_LINEAR:
-	 shader->inputs[i].interp = LP_INTERP_LINEAR;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_LINEAR;
+         break;
       case TGSI_INTERPOLATE_PERSPECTIVE:
-	 shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+         break;
       case TGSI_INTERPOLATE_COLOR:
-	 shader->inputs[i].interp = LP_INTERP_COLOR;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_COLOR;
+         break;
       default:
-	 assert(0);
-	 break;
+         assert(0);
+         break;
       }
 
       switch (shader->info.base.input_semantic_name[i]) {
       case TGSI_SEMANTIC_FACE:
-	 shader->inputs[i].interp = LP_INTERP_FACING;
-	 break;
+         shader->inputs[i].interp = LP_INTERP_FACING;
+         break;
       case TGSI_SEMANTIC_POSITION:
-	 /* Position was already emitted above
-	  */
-	 shader->inputs[i].interp = LP_INTERP_POSITION;
-	 shader->inputs[i].src_index = 0;
-	 continue;
+         /* Position was already emitted above
+          */
+         shader->inputs[i].interp = LP_INTERP_POSITION;
+         shader->inputs[i].src_index = 0;
+         continue;
       }
 
+      /* XXX this is a completely pointless index map... */
       shader->inputs[i].src_index = i+1;
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index 20e177f54fb..6a4fbbbf202 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -372,9 +372,9 @@ load_attribute(struct gallivm_state *gallivm,
    /* Potentially modify it according to twoside, etc:
     */
    if (key->twoside) {
-      if (vert_attr == key->color_slot && key->bcolor_slot > 0)
+      if (vert_attr == key->color_slot && key->bcolor_slot >= 0)
          lp_twoside(gallivm, args, key, key->bcolor_slot, attribv);
-      else if (vert_attr == key->spec_slot && key->bspec_slot > 0)
+      else if (vert_attr == key->spec_slot && key->bspec_slot >= 0)
          lp_twoside(gallivm, args, key, key->bspec_slot, attribv);
    }
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
index 6cee6fe5eb5..9ad244482de 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -17,10 +17,10 @@ struct lp_setup_variant_list_item
 struct lp_setup_variant_key {
    unsigned size:16;
    unsigned num_inputs:8;
-   unsigned color_slot:8;
-   unsigned bcolor_slot:8;
-   unsigned spec_slot:8;
-   unsigned bspec_slot:8;
+   int color_slot:8;
+   int bcolor_slot:8;
+   int spec_slot:8;
+   int bspec_slot:8;
    unsigned flatshade_first:1;
    unsigned pixel_center_half:1;
    unsigned twoside:1;

From 8d4039ecdb167771d4b085f70b9666438be1c6ad Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Tue, 22 Dec 2015 03:42:33 +0100
Subject: [PATCH 150/241] softpipe: tell draw about the vertex layout we want

This makes it more similar to llvmpipe. It also allows us to let the draw emit
code handle things like getting zeros for non-existing vs outputs
automatically. There probably isn't really any overhead either way: there isn't
really any "simply copy everything" code in the emit path, so it would copy each
attrib individually just the same. Likewise, we still do another mapping step
in softpipe as the layout may still not match exactly (same as in llvmpipe;
we should probably nuke the pointless mapping in both drivers).

This fixes the piglit arb_fragment_layer_viewport no_gs/no_write tests.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/drivers/softpipe/sp_context.h     |   8 +-
 src/gallium/drivers/softpipe/sp_prim_vbuf.c   |   4 +-
 src/gallium/drivers/softpipe/sp_setup.h       |   2 +-
 .../drivers/softpipe/sp_state_derived.c       | 153 ++++++++++++------
 4 files changed, 107 insertions(+), 60 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 188cdeaf76f..d5c4aaae638 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -119,16 +119,16 @@ struct softpipe_context {
 
    /** Vertex format */
    struct sp_setup_info setup_info;
-   struct vertex_info vertex_info_vbuf;
+   struct vertex_info vertex_info;
 
    /** Which vertex shader output slot contains point size */
-   int psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   int viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which vertex shader output slot contains layer */
-   int layer_slot;
+   int8_t layer_slot;
 
    /** The reduced version of the primitive supplied by the state tracker */
    unsigned reduced_api_prim;
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index f8a3eacdb37..95d1ac1514f 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -161,7 +161,7 @@ sp_vbuf_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
    struct setup_context *setup = cvbr->setup;
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
@@ -358,7 +358,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
    struct setup_context *setup = cvbr->setup;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
index 9efae1cb5ed..a54dc5dad0c 100644
--- a/src/gallium/drivers/softpipe/sp_setup.h
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -45,7 +45,7 @@ struct sp_setup_info {
    unsigned valid;
    struct {
       unsigned interp:8;      /**< SP_INTERP_X */
-      unsigned src_index:8;
+      int src_index:8;
    } attrib[PIPE_MAX_SHADER_OUTPUTS];
 };
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index ca29d76f8c2..d4d03f1be50 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -63,35 +63,47 @@ static void
 softpipe_compute_vertex_info(struct softpipe_context *softpipe)
 {
    struct sp_setup_info *sinfo = &softpipe->setup_info;
-   int vs_index;
 
    if (sinfo->valid == 0) {
-      /* compute vertex layout for vbuf now */
       const struct tgsi_shader_info *fsInfo = &softpipe->fs_variant->info;
-      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-      const uint num = draw_num_shader_outputs(softpipe->draw);
+      struct vertex_info *vinfo = &softpipe->vertex_info;
       uint i;
-
-      /* Tell draw_vbuf to simply emit the whole post-xform vertex
-       * as-is.  No longer any need to try and emit draw vertex_header
-       * info.
+      int vs_index;
+      /*
+       * This doesn't quite work right (wrt face injection, prim id,
+       * wide points) - hit a couple assertions, misrenderings plus
+       * memory corruption. Albeit could fix (the former two) by calling
+       * this "more often" (rasterizer changes etc.). (The latter would
+       * need to be included in draw_prepare_shader_outputs, but it looks
+       * like that would potentially allocate quite some unused additional
+       * vertex outputs.)
+       * draw_prepare_shader_outputs(softpipe->draw);
        */
-      vinfo_vbuf->num_attribs = 0;
-      for (i = 0; i < num; i++) {
-         draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, i);
-      }
-      draw_compute_vertex_size(vinfo_vbuf);
-
-      softpipe->viewport_index_slot = 0;
-      softpipe->layer_slot = 0;
-      softpipe->psize_slot = 0;
 
       /*
-       * Loop over fragment shader inputs, searching for the matching output
-       * from the vertex shader.
+       * Those can't actually be 0 (because pos is always at 0).
+       * But use ints anyway to avoid confusion (in vs outputs, they
+       * can very well be at pos 0).
+       */
+      softpipe->viewport_index_slot = -1;
+      softpipe->layer_slot = -1;
+      softpipe->psize_slot = -1;
+
+      vinfo->num_attribs = 0;
+
+      /*
+       * Put position always first (setup needs it there).
+       */
+      vs_index = draw_find_shader_output(softpipe->draw,
+                                         TGSI_SEMANTIC_POSITION, 0);
+
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+
+      /*
+       * Match FS inputs against VS outputs, emitting the necessary
+       * attributes.
        */
       for (i = 0; i < fsInfo->num_inputs; i++) {
-         int src;
          enum sp_interp_mode interp = SP_INTERP_LINEAR;
 
          switch (fsInfo->input_interpolate[i]) {
@@ -126,57 +138,93 @@ softpipe_compute_vertex_info(struct softpipe_context *softpipe)
             break;
          }
 
-         /* this includes texcoords and varying vars */
-         src = draw_find_shader_output(softpipe->draw,
-                                       fsInfo->input_semantic_name[i],
-                                       fsInfo->input_semantic_index[i]);
-         if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && src == -1)
+         /*
+          * Search for each input in current vs output:
+          */
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            fsInfo->input_semantic_name[i],
+                                            fsInfo->input_semantic_index[i]);
+
+         if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+             vs_index == -1) {
             /*
              * try and find a bcolor.
              * Note that if there's both front and back color, draw will
              * have copied back to front color already.
              */
-            src = draw_find_shader_output(softpipe->draw,
-                                          TGSI_SEMANTIC_BCOLOR,
-                                          fsInfo->input_semantic_index[i]);
+            vs_index = draw_find_shader_output(softpipe->draw,
+                                               TGSI_SEMANTIC_BCOLOR,
+                                               fsInfo->input_semantic_index[i]);
+         }
 
          sinfo->attrib[i].interp = interp;
+         /* extremely pointless index map */
+         sinfo->attrib[i].src_index = i + 1;
          /*
-          * note src can be -1 if not found. Would need special handling,
-          * (as we don't tell draw anything about it) just force to 0.
-          * It's wrong either way but should be safer...
+          * For vp index and layer, if the fs requires them but the vs doesn't
+          * provide them, draw (vbuf) will give us the required 0 (slot -1).
+          * (This means in this case we'll also use those slots in setup, which
+          * isn't necessary but they'll contain the correct (0) value.)
           */
-         if (src < 0)
-            src = 0;
-         sinfo->attrib[i].src_index = src;
+         if (fsInfo->input_semantic_name[i] ==
+                    TGSI_SEMANTIC_VIEWPORT_INDEX) {
+            softpipe->viewport_index_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+         } else {
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need pointsize as well. */
+      /* Figure out if we need pointsize as well.
+       */
       vs_index = draw_find_shader_output(softpipe->draw,
                                          TGSI_SEMANTIC_PSIZE, 0);
 
       if (vs_index >= 0) {
-         softpipe->psize_slot = vs_index;
+         softpipe->psize_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
 
-      /* Figure out if we need viewport index */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_VIEWPORT_INDEX,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->viewport_index_slot = vs_index;
+      /* Figure out if we need viewport index (if it wasn't already in fs input) */
+      if (softpipe->viewport_index_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_VIEWPORT_INDEX,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->viewport_index_slot =(int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need layer */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_LAYER,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->layer_slot = vs_index;
+      /* Figure out if we need layer (if it wasn't already in fs input) */
+      if (softpipe->layer_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_LAYER,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
+
+      draw_compute_vertex_size(vinfo);
       softpipe->setup_info.valid = 1;
    }
-
    return;
 }
 
@@ -184,15 +232,14 @@ softpipe_compute_vertex_info(struct softpipe_context *softpipe)
 /**
  * Called from vbuf module.
  *
- * Note the vertex layout used for vbuf is simply telling it to pass
- * through everything as is. The mapping actually used for setup is
- * stored separately (but calculated here too at the same time).
+ * This will trigger validation of the vertex layout (and also compute
+ * the required information for setup).
  */
 struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 {
    softpipe_compute_vertex_info(softpipe);
-   return &softpipe->vertex_info_vbuf;
+   return &softpipe->vertex_info;
 }
 
 

From 3e8f644ed31809ff558dc635581ba7abf6a16776 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 5 Jan 2016 05:34:24 -0800
Subject: [PATCH 151/241] glsl: Disallow vectorization of
 vector_insert/extract.

vector_insert takes a vector, a scalar location, and a scalar value,
and produces a new vector with that component updated.  As such, it
can't be vectorized properly.

vector_extract takes a vector and a scalar location, and returns
that scalar component of the vector.  Vectorization doesn't really
make any sense.

Treating both as horizontal operations makes sure the vectorizer
won't try to touch these.

Found by inspection.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/ir.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index c56c95994b8..a728c036e6b 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1724,6 +1724,8 @@ public:
       return operation == ir_binop_all_equal ||
              operation == ir_binop_any_nequal ||
              operation == ir_binop_dot ||
+             operation == ir_binop_vector_extract ||
+             operation == ir_triop_vector_insert ||
              operation == ir_quadop_vector;
    }
 

From f3658be108aa4637aec44d544164d772774ae165 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 23 Dec 2015 03:01:32 +0100
Subject: [PATCH 152/241] tgsi/scan: set if a fragment shader writes sample
 mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will be used by radeonsi.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 2 ++
 src/gallium/auxiliary/tgsi/tgsi_scan.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index e04f4076e9c..e3feed9aa98 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -392,6 +392,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                      }
                      else if (semName == TGSI_SEMANTIC_STENCIL) {
                         info->writes_stencil = TRUE;
+                     } else if (semName == TGSI_SEMANTIC_SAMPLEMASK) {
+                        info->writes_samplemask = TRUE;
                      }
                   }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 7e9a5597db2..a3e437809aa 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -82,6 +82,7 @@ struct tgsi_shader_info
    boolean reads_z; /**< does fragment shader read depth? */
    boolean writes_z;  /**< does fragment shader write Z value? */
    boolean writes_stencil; /**< does fragment shader write stencil value? */
+   boolean writes_samplemask; /**< does fragment shader write sample mask? */
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KILL or KILL_IF instruction used? */
    boolean uses_persp_center;

From 18ec76730a75b6fbc2783d6a84a844cb038456ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 2 Jan 2016 17:28:19 +0100
Subject: [PATCH 153/241] tgsi/scan: fix tgsi_shader_info::reads_z
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This has no users in Mesa.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index e3feed9aa98..e3a6fb0437c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -187,8 +187,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   }
 
                   if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                      info->reads_position &&
-                      src->Register.Index == 0 &&
+		      !src->Register.Indirect &&
+		      info->input_semantic_name[src->Register.Index] ==
+		      TGSI_SEMANTIC_POSITION &&
                       (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
                        src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
                        src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||

From ff7e77724e0cf1409f84118d0d4ceee65535ccdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 1 Jan 2016 19:42:44 +0100
Subject: [PATCH 154/241] tgsi/scan: set which color components are read by a
 fragment shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will be used by radeonsi.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 30 +++++++++++++++++++-------
 src/gallium/auxiliary/tgsi/tgsi_scan.h |  1 +
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index e3a6fb0437c..6ea32eedd74 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -187,14 +187,28 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   }
 
                   if (procType == TGSI_PROCESSOR_FRAGMENT &&
-		      !src->Register.Indirect &&
-		      info->input_semantic_name[src->Register.Index] ==
-		      TGSI_SEMANTIC_POSITION &&
-                      (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleW == TGSI_SWIZZLE_Z)) {
-                     info->reads_z = TRUE;
+                      !src->Register.Indirect) {
+                     unsigned name =
+                        info->input_semantic_name[src->Register.Index];
+                     unsigned index =
+                        info->input_semantic_index[src->Register.Index];
+
+                     if (name == TGSI_SEMANTIC_POSITION &&
+                         (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleW == TGSI_SWIZZLE_Z))
+                        info->reads_z = TRUE;
+
+                     if (name == TGSI_SEMANTIC_COLOR) {
+                        unsigned mask =
+                              (1 << src->Register.SwizzleX) |
+                              (1 << src->Register.SwizzleY) |
+                              (1 << src->Register.SwizzleZ) |
+                              (1 << src->Register.SwizzleW);
+
+                        info->colors_read |= mask << (index * 4);
+                     }
                   }
                }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index a3e437809aa..b0b423ab528 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -77,6 +77,7 @@ struct tgsi_shader_info
 
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
+   ubyte colors_read; /**< which color components are read by the FS */
    ubyte colors_written;
    boolean reads_position; /**< does fragment shader read position? */
    boolean reads_z; /**< does fragment shader read depth? */

From 2cb8bf90cd21cdeba708ed4ee875a9e4ada128ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 23 Dec 2015 15:36:05 +0100
Subject: [PATCH 155/241] radeonsi: determine DB_SHADER_CONTROL outside of
 shader compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One day the API pixel shader binary will no longer emulate alpha test,
so the KILL_ENABLE bit must be determined elsewhere.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c      | 20 ---------
 src/gallium/drivers/radeonsi/si_shader.h      |  5 ++-
 .../drivers/radeonsi/si_state_shaders.c       | 43 ++++++++++++++++---
 3 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 426f40fe8f5..4b49f9d6be3 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1390,8 +1390,6 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 				LLVMVoidTypeInContext(gallivm->context),
 				NULL, 0, 0);
 	}
-
-	si_shader_ctx->shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
 }
 
 static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
@@ -2229,22 +2227,18 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2];
 			args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
 			mask |= 0x1;
-			si_shader_ctx->shader->db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
 		}
 
 		if (stencil_index >= 0) {
 			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1];
 			args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
 			mask |= 0x2;
-			si_shader_ctx->shader->db_shader_control |=
-				S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1);
 		}
 
 		if (samplemask_index >= 0) {
 			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0];
 			args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
 			mask |= 0x4;
-			si_shader_ctx->shader->db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(1);
 		}
 
 		/* SI (except OLAND) has a bug that it only looks
@@ -4113,9 +4107,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	if (sel->type != PIPE_SHADER_COMPUTE)
 		shader->dx10_clamp_mode = true;
 
-	if (sel->info.uses_kill)
-		shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
-
 	shader->uses_instanceid = sel->info.uses_instanceid;
 	bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
@@ -4190,17 +4181,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	case TGSI_PROCESSOR_FRAGMENT:
 		si_shader_ctx.radeon_bld.load_input = declare_input_fs;
 		bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
-
-		switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
-		case TGSI_FS_DEPTH_LAYOUT_GREATER:
-			shader->db_shader_control |=
-				S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
-			break;
-		case TGSI_FS_DEPTH_LAYOUT_LESS:
-			shader->db_shader_control |=
-				S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
-			break;
-		}
 		break;
 	default:
 		assert(!"Unsupported shader type");
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index d377a2a2cfd..067704fd838 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -201,6 +201,7 @@ struct si_shader_selector {
 	bool		forces_persample_interp_for_persp;
 	bool		forces_persample_interp_for_linear;
 
+	/* GS parameters. */
 	unsigned	esgs_itemsize;
 	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
@@ -210,6 +211,9 @@ struct si_shader_selector {
 	unsigned	gsvs_vertex_size;
 	unsigned	max_gsvs_emit_size;
 
+	/* PS parameters. */
+	unsigned	db_shader_control;
+
 	/* masks of "get_unique_index" bits */
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
@@ -275,7 +279,6 @@ struct si_shader {
 	unsigned			scratch_bytes_per_wave;
 	unsigned			spi_shader_col_format;
 	unsigned			spi_shader_z_format;
-	unsigned			db_shader_control;
 	unsigned			cb_shader_mask;
 	union si_shader_key		key;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index c7045c31d56..41e331b6ba8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -496,6 +496,16 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 	}
 }
 
+static unsigned si_get_alpha_test_func(struct si_context *sctx)
+{
+	/* Alpha-test should be disabled if colorbuffer 0 is integer. */
+	if (sctx->queued.named.dsa &&
+	    !sctx->framebuffer.cb0_is_integer)
+		return sctx->queued.named.dsa->alpha_func;
+
+	return PIPE_FUNC_ALWAYS;
+}
+
 /* Compute the key for the hw shader variant */
 static inline void si_shader_selector_key(struct pipe_context *ctx,
 					  struct si_shader_selector *sel,
@@ -562,11 +572,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			key->ps.clamp_color = rs->clamp_fragment_color;
 		}
 
-		key->ps.alpha_func = PIPE_FUNC_ALWAYS;
-		/* Alpha-test should be disabled if colorbuffer 0 is integer. */
-		if (sctx->queued.named.dsa &&
-		    !sctx->framebuffer.cb0_is_integer)
-			key->ps.alpha_func = sctx->queued.named.dsa->alpha_func;
+		key->ps.alpha_func = si_get_alpha_test_func(sctx);
 		break;
 	}
 	default:
@@ -731,6 +737,25 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		break;
 	}
 
+	/* DB_SHADER_CONTROL */
+	sel->db_shader_control =
+		S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
+		S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
+		S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
+		S_02880C_KILL_ENABLE(sel->info.uses_kill);
+
+	switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
+	case TGSI_FS_DEPTH_LAYOUT_GREATER:
+		sel->db_shader_control |=
+			S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+		break;
+	case TGSI_FS_DEPTH_LAYOUT_LESS:
+		sel->db_shader_control |=
+			S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+		break;
+	}
+
+	/* Pre-compilation. */
 	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
 		struct si_shader_ctx_state state = {sel};
 
@@ -1549,6 +1574,10 @@ bool si_update_shaders(struct si_context *sctx)
 	si_update_vgt_shader_config(sctx);
 
 	if (sctx->ps_shader.cso) {
+		unsigned db_shader_control =
+			sctx->ps_shader.cso->db_shader_control |
+			S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
+
 		r = si_shader_select(ctx, &sctx->ps_shader);
 		if (r)
 			return false;
@@ -1568,8 +1597,8 @@ bool si_update_shaders(struct si_context *sctx)
 			si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
 		}
 
-		if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
-			sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
+		if (sctx->ps_db_shader_control != db_shader_control) {
+			sctx->ps_db_shader_control = db_shader_control;
 			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 		}
 

From 746a7a74983a708edfd60ac2e501939844eacfc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 23 Dec 2015 16:02:46 +0100
Subject: [PATCH 156/241] radeonsi: determine SPI_SHADER_Z_FORMAT outside of
 shader compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c        | 7 -------
 src/gallium/drivers/radeonsi/si_shader.h        | 1 -
 src/gallium/drivers/radeonsi/si_state_shaders.c | 7 ++++++-
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4b49f9d6be3..b7c44b9d9aa 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2247,13 +2247,6 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		    si_shader_ctx->screen->b.family != CHIP_OLAND)
 			mask |= 0x1;
 
-		if (samplemask_index >= 0)
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_ABGR;
-		else if (stencil_index >= 0)
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_GR;
-		else
-			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R;
-
 		/* Specify which components to enable */
 		args[0] = lp_build_const_int32(base->gallivm, mask);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 067704fd838..3d14c79b803 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -278,7 +278,6 @@ struct si_shader {
 	unsigned			float_mode;
 	unsigned			scratch_bytes_per_wave;
 	unsigned			spi_shader_col_format;
-	unsigned			spi_shader_z_format;
 	unsigned			cb_shader_mask;
 	union si_shader_key		key;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 41e331b6ba8..61db8ef714c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -431,7 +431,12 @@ static void si_shader_ps(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
 	si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);
 
-	si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, shader->spi_shader_z_format);
+	si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT,
+		       info->writes_samplemask ? V_028710_SPI_SHADER_32_ABGR :
+		       info->writes_stencil ? V_028710_SPI_SHADER_32_GR :
+		       info->writes_z ? V_028710_SPI_SHADER_32_R :
+		       V_028710_SPI_SHADER_ZERO);
+
 	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT,
 		       shader->spi_shader_col_format);
 	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, shader->cb_shader_mask);

From 4e597c25c76e502e5fd32900feb1ea953cf03338 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 23 Dec 2015 16:24:02 +0100
Subject: [PATCH 157/241] radeonsi: write all MRTs only if there is exactly one
 output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This doesn't fix a known bug, but better safe than sorry.

Also, simplify the expression in si_shader.c.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c        | 5 ++---
 src/gallium/drivers/radeonsi/si_state_shaders.c | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index b7c44b9d9aa..f322c4e3f5a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2189,9 +2189,8 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 			/* This instruction will be emitted at the end of the shader. */
 			memcpy(last_args, args, sizeof(args));
 
-			/* Handle FS_COLOR0_WRITES_ALL_CBUFS. */
-			if (shader->selector->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
-			    semantic_index == 0 &&
+			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+			if (semantic_index == 0 &&
 			    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
 				for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
 					si_llvm_init_export_args_load(bld_base,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 61db8ef714c..4b007ec8878 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -552,8 +552,10 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 	case PIPE_SHADER_FRAGMENT: {
 		struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
-		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+		    sel->info.colors_written == 0x1)
 			key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+
 		key->ps.export_16bpc = sctx->framebuffer.export_16bpc;
 
 		if (rs) {

From e00f3f23b13e2ad99977005da2c1538f181e401d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 23 Dec 2015 16:43:54 +0100
Subject: [PATCH 158/241] radeonsi: set SPI color formats and CB_SHADER_MASK
 outside of compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c      | 12 +-------
 src/gallium/drivers/radeonsi/si_shader.h      |  2 --
 .../drivers/radeonsi/si_state_shaders.c       | 30 +++++++++++++++++--
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index f322c4e3f5a..85113c00f5b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1302,18 +1302,8 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 	if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 		int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
-		if (cbuf >= 0 && cbuf < 8) {
+		if (cbuf >= 0 && cbuf < 8)
 			compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;
-
-			if (compressed)
-				si_shader_ctx->shader->spi_shader_col_format |=
-					V_028714_SPI_SHADER_FP16_ABGR << (4 * cbuf);
-			else
-				si_shader_ctx->shader->spi_shader_col_format |=
-					V_028714_SPI_SHADER_32_ABGR << (4 * cbuf);
-
-			si_shader_ctx->shader->cb_shader_mask |= 0xf << (4 * cbuf);
-		}
 	}
 
 	/* Set COMPR flag */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 3d14c79b803..b89d3b29e69 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -277,8 +277,6 @@ struct si_shader {
 	unsigned			spi_ps_input_ena;
 	unsigned			float_mode;
 	unsigned			scratch_bytes_per_wave;
-	unsigned			spi_shader_col_format;
-	unsigned			cb_shader_mask;
 	union si_shader_key		key;
 
 	unsigned		nparam;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4b007ec8878..b08b035f62f 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -387,6 +387,8 @@ static void si_shader_ps(struct si_shader *shader)
 	struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned i, spi_ps_in_control;
+	unsigned spi_shader_col_format = 0, cb_shader_mask = 0;
+	unsigned colors_written, export_16bpc;
 	unsigned num_sgprs, num_user_sgprs;
 	unsigned spi_baryc_cntl = 0;
 	uint64_t va;
@@ -422,12 +424,35 @@ static void si_shader_ps(struct si_shader *shader)
 		}
 	}
 
+	/* Find out what SPI_SHADER_COL_FORMAT and CB_SHADER_MASK should be. */
+	colors_written = info->colors_written;
+	export_16bpc = shader->key.ps.export_16bpc;
+
+	if (info->colors_written == 0x0) {
+		colors_written = 0x1; /* dummy export */
+		export_16bpc = 0;
+	} else if (info->colors_written == 0x1 &&
+		   info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) {
+		colors_written |= (1 << (shader->key.ps.last_cbuf + 1)) - 1;
+	}
+
+	while (colors_written) {
+		i = u_bit_scan(&colors_written);
+		if (export_16bpc & (1 << i))
+			spi_shader_col_format |= V_028714_SPI_SHADER_FP16_ABGR << (4 * i);
+		else
+			spi_shader_col_format |= V_028714_SPI_SHADER_32_ABGR << (4 * i);
+		cb_shader_mask |= 0xf << (4 * i);
+	}
+
+	/* Set interpolation controls. */
 	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
 		       G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
 
 	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
 			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 
+	/* Set registers. */
 	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
 	si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);
 
@@ -437,9 +462,8 @@ static void si_shader_ps(struct si_shader *shader)
 		       info->writes_z ? V_028710_SPI_SHADER_32_R :
 		       V_028710_SPI_SHADER_ZERO);
 
-	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT,
-		       shader->spi_shader_col_format);
-	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, shader->cb_shader_mask);
+	si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT, spi_shader_col_format);
+	si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, cb_shader_mask);
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);

From 5f3e6b5b0f0665c6593cdc59efdb11b0ef6063c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 23 Dec 2015 18:06:04 +0100
Subject: [PATCH 159/241] radeonsi: simplify setting the DONE bit for PS
 exports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First find out what the last export is and simply set the DONE bit there.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c      | 126 ++++++++----------
 .../drivers/radeonsi/si_state_shaders.c       |   2 +-
 2 files changed, 55 insertions(+), 73 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 85113c00f5b..8441fb42c74 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2109,10 +2109,36 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct tgsi_shader_info *info = &shader->selector->info;
 	LLVMBuilderRef builder = base->gallivm->builder;
 	LLVMValueRef args[9];
-	LLVMValueRef last_args[9] = { 0 };
 	int depth_index = -1, stencil_index = -1, samplemask_index = -1;
+	int last_color_export = -1;
 	int i;
 
+	/* If there are no outputs, add a dummy export. */
+	if (!info->num_outputs) {
+		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
+		args[1] = uint->one; /* whether the EXEC mask is valid */
+		args[2] = uint->one; /* DONE bit */
+		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
+		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+		args[5] = uint->zero; /* R */
+		args[6] = uint->zero; /* G */
+		args[7] = uint->zero; /* B */
+		args[8] = uint->zero; /* A */
+
+		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+				   LLVMVoidTypeInContext(base->gallivm->context),
+				   args, 9, 0);
+		return;
+	}
+
+	/* Determine the last export. If MRTZ is present, it's always last.
+	 * Otherwise, find the last color export.
+	 */
+	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask)
+		for (i = 0; i < info->num_outputs; i++)
+			if (info->output_semantic_name[i] == TGSI_SEMANTIC_COLOR)
+				last_color_export = i;
+
 	for (i = 0; i < info->num_outputs; i++) {
 		unsigned semantic_name = info->output_semantic_name[i];
 		unsigned semantic_index = info->output_semantic_index[i];
@@ -2157,56 +2183,48 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 
 			break;
 		default:
-			target = 0;
 			fprintf(stderr,
 				"Warning: SI unhandled fs output type:%d\n",
 				semantic_name);
+			continue;
+		}
+
+		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+		if (semantic_index == 0 &&
+		    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
+			for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
+				si_llvm_init_export_args_load(bld_base,
+							      si_shader_ctx->radeon_bld.soa.outputs[i],
+							      V_008DFC_SQ_EXP_MRT + c, args);
+				lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+						   LLVMVoidTypeInContext(base->gallivm->context),
+						   args, 9, 0);
+			}
 		}
 
 		si_llvm_init_export_args_load(bld_base,
 					      si_shader_ctx->radeon_bld.soa.outputs[i],
 					      target, args);
-
-		if (semantic_name == TGSI_SEMANTIC_COLOR) {
-			/* If there is an export instruction waiting to be emitted, do so now. */
-			if (last_args[0]) {
-				lp_build_intrinsic(base->gallivm->builder,
-						   "llvm.SI.export",
-						   LLVMVoidTypeInContext(base->gallivm->context),
-						   last_args, 9, 0);
-			}
-
-			/* This instruction will be emitted at the end of the shader. */
-			memcpy(last_args, args, sizeof(args));
-
-			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-			if (semantic_index == 0 &&
-			    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
-				for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
-					si_llvm_init_export_args_load(bld_base,
-								      si_shader_ctx->radeon_bld.soa.outputs[i],
-								      V_008DFC_SQ_EXP_MRT + c, args);
-					lp_build_intrinsic(base->gallivm->builder,
-							   "llvm.SI.export",
-							   LLVMVoidTypeInContext(base->gallivm->context),
-							   args, 9, 0);
-				}
-			}
-		} else {
-			lp_build_intrinsic(base->gallivm->builder,
-					   "llvm.SI.export",
-					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9, 0);
+		if (last_color_export == i) {
+			args[1] = uint->one; /* whether the EXEC mask is valid */
+			args[2] = uint->one; /* DONE bit */
 		}
+		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+				   LLVMVoidTypeInContext(base->gallivm->context),
+				   args, 9, 0);
 	}
 
 	if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) {
 		LLVMValueRef out_ptr;
 		unsigned mask = 0;
 
+		args[1] = uint->one; /* whether the EXEC mask is valid */
+		args[2] = uint->one; /* DONE bit */
+
 		/* Specify the target we are exporting */
 		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
 
+		args[4] = uint->zero; /* COMP flag */
 		args[5] = base->zero; /* R, depth */
 		args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
 		args[7] = base->zero; /* B, sample mask */
@@ -2239,46 +2257,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		/* Specify which components to enable */
 		args[0] = lp_build_const_int32(base->gallivm, mask);
 
-		args[1] =
-		args[2] =
-		args[4] = uint->zero;
-
-		if (last_args[0])
-			lp_build_intrinsic(base->gallivm->builder,
-					   "llvm.SI.export",
-					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9, 0);
-		else
-			memcpy(last_args, args, sizeof(args));
+		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+				   LLVMVoidTypeInContext(base->gallivm->context),
+				   args, 9, 0);
 	}
-
-	if (!last_args[0]) {
-		/* Specify which components to enable */
-		last_args[0] = lp_build_const_int32(base->gallivm, 0x0);
-
-		/* Specify the target we are exporting */
-		last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
-
-		/* Set COMPR flag to zero to export data as 32-bit */
-		last_args[4] = uint->zero;
-
-		/* dummy bits */
-		last_args[5]= uint->zero;
-		last_args[6]= uint->zero;
-		last_args[7]= uint->zero;
-		last_args[8]= uint->zero;
-	}
-
-	/* Specify whether the EXEC mask represents the valid mask */
-	last_args[1] = uint->one;
-
-	/* Specify that this is the last export */
-	last_args[2] = lp_build_const_int32(base->gallivm, 1);
-
-	lp_build_intrinsic(base->gallivm->builder,
-			   "llvm.SI.export",
-			   LLVMVoidTypeInContext(base->gallivm->context),
-			   last_args, 9, 0);
 }
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b08b035f62f..68ba7ec00b4 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -428,7 +428,7 @@ static void si_shader_ps(struct si_shader *shader)
 	colors_written = info->colors_written;
 	export_16bpc = shader->key.ps.export_16bpc;
 
-	if (info->colors_written == 0x0) {
+	if (!info->num_outputs) {
 		colors_written = 0x1; /* dummy export */
 		export_16bpc = 0;
 	} else if (info->colors_written == 0x1 &&

From 1ce659f8209b721bf090336333f3030536cd853b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 17:38:37 +0100
Subject: [PATCH 160/241] radeonsi: move MRTZ export into a separate function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 113 +++++++++++++----------
 1 file changed, 62 insertions(+), 51 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 8441fb42c74..e08a07618d8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2100,6 +2100,59 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	FREE(outputs);
 }
 
+static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
+			   LLVMValueRef depth, LLVMValueRef stencil,
+			   LLVMValueRef samplemask)
+{
+	struct si_screen *sscreen = si_shader_context(bld_base)->screen;
+	struct lp_build_context *base = &bld_base->base;
+	struct lp_build_context *uint = &bld_base->uint_bld;
+	LLVMValueRef args[9];
+	unsigned mask = 0;
+
+	assert(depth || stencil || samplemask);
+
+	args[1] = uint->one; /* whether the EXEC mask is valid */
+	args[2] = uint->one; /* DONE bit */
+
+	/* Specify the target we are exporting */
+	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
+
+	args[4] = uint->zero; /* COMP flag */
+	args[5] = base->zero; /* R, depth */
+	args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
+	args[7] = base->zero; /* B, sample mask */
+	args[8] = base->zero; /* A, alpha to mask */
+
+	if (depth) {
+		args[5] = depth;
+		mask |= 0x1;
+	}
+
+	if (stencil) {
+		args[6] = stencil;
+		mask |= 0x2;
+	}
+
+	if (samplemask) {
+		args[7] = samplemask;
+		mask |= 0x4;
+	}
+
+	/* SI (except OLAND) has a bug that it only looks
+	 * at the X writemask component. */
+	if (sscreen->b.chip_class == SI &&
+	    sscreen->b.family != CHIP_OLAND)
+		mask |= 0x1;
+
+	/* Specify which components to enable */
+	args[0] = lp_build_const_int32(base->gallivm, mask);
+
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
+
 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 {
 	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
@@ -2109,7 +2162,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct tgsi_shader_info *info = &shader->selector->info;
 	LLVMBuilderRef builder = base->gallivm->builder;
 	LLVMValueRef args[9];
-	int depth_index = -1, stencil_index = -1, samplemask_index = -1;
+	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
 	int last_color_export = -1;
 	int i;
 
@@ -2148,13 +2201,16 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		/* Select the correct target */
 		switch (semantic_name) {
 		case TGSI_SEMANTIC_POSITION:
-			depth_index = i;
+			depth = LLVMBuildLoad(builder,
+					      si_shader_ctx->radeon_bld.soa.outputs[i][2], "");
 			continue;
 		case TGSI_SEMANTIC_STENCIL:
-			stencil_index = i;
+			stencil = LLVMBuildLoad(builder,
+						si_shader_ctx->radeon_bld.soa.outputs[i][1], "");
 			continue;
 		case TGSI_SEMANTIC_SAMPLEMASK:
-			samplemask_index = i;
+			samplemask = LLVMBuildLoad(builder,
+						   si_shader_ctx->radeon_bld.soa.outputs[i][0], "");
 			continue;
 		case TGSI_SEMANTIC_COLOR:
 			target = V_008DFC_SQ_EXP_MRT + semantic_index;
@@ -2214,53 +2270,8 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 				   args, 9, 0);
 	}
 
-	if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) {
-		LLVMValueRef out_ptr;
-		unsigned mask = 0;
-
-		args[1] = uint->one; /* whether the EXEC mask is valid */
-		args[2] = uint->one; /* DONE bit */
-
-		/* Specify the target we are exporting */
-		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
-
-		args[4] = uint->zero; /* COMP flag */
-		args[5] = base->zero; /* R, depth */
-		args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
-		args[7] = base->zero; /* B, sample mask */
-		args[8] = base->zero; /* A, alpha to mask */
-
-		if (depth_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2];
-			args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x1;
-		}
-
-		if (stencil_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1];
-			args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x2;
-		}
-
-		if (samplemask_index >= 0) {
-			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0];
-			args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-			mask |= 0x4;
-		}
-
-		/* SI (except OLAND) has a bug that it only looks
-		 * at the X writemask component. */
-		if (si_shader_ctx->screen->b.chip_class == SI &&
-		    si_shader_ctx->screen->b.family != CHIP_OLAND)
-			mask |= 0x1;
-
-		/* Specify which components to enable */
-		args[0] = lp_build_const_int32(base->gallivm, mask);
-
-		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-				   LLVMVoidTypeInContext(base->gallivm->context),
-				   args, 9, 0);
-	}
+	if (depth || stencil || samplemask)
+		si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,

From 185267a6fd6f021ed4e5b3647a4d57c3ca6ea86b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 17:45:52 +0100
Subject: [PATCH 161/241] radeonsi: export "undef" values for undefined PS
 outputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index e08a07618d8..73a34ac254e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1323,11 +1323,12 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 						    LLVMInt32TypeInContext(base->gallivm->context),
 						    pack_args, 2,
 						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
-			args[chan + 7] = args[chan + 5] =
+			args[chan + 5] =
 				LLVMBuildBitCast(base->gallivm->builder,
 						 packed,
 						 LLVMFloatTypeInContext(base->gallivm->context),
 						 "");
+			args[chan + 7] = base->undef;
 		}
 	} else
 		memcpy(&args[5], values, sizeof(values[0]) * 4);
@@ -2119,10 +2120,10 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
 	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
 
 	args[4] = uint->zero; /* COMP flag */
-	args[5] = base->zero; /* R, depth */
-	args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
-	args[7] = base->zero; /* B, sample mask */
-	args[8] = base->zero; /* A, alpha to mask */
+	args[5] = base->undef; /* R, depth */
+	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
+	args[7] = base->undef; /* B, sample mask */
+	args[8] = base->undef; /* A, alpha to mask */
 
 	if (depth) {
 		args[5] = depth;
@@ -2173,10 +2174,10 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		args[2] = uint->one; /* DONE bit */
 		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
 		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
-		args[5] = uint->zero; /* R */
-		args[6] = uint->zero; /* G */
-		args[7] = uint->zero; /* B */
-		args[8] = uint->zero; /* A */
+		args[5] = uint->undef; /* R */
+		args[6] = uint->undef; /* G */
+		args[7] = uint->undef; /* B */
+		args[8] = uint->undef; /* A */
 
 		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
 				   LLVMVoidTypeInContext(base->gallivm->context),

From 677c65968b636793bc5e928615fdd12491066ded Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 17:53:44 +0100
Subject: [PATCH 162/241] radeonsi: only use LLVMBuildLoad once when updating
 color outputs at the end
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The loaded values are now updated directly and exported, without writing
them back through LLVMBuildStore.

So:
- do LLVMBuildLoad
- update the values as necessary
- export

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 67 +++++++-----------------
 1 file changed, 20 insertions(+), 47 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 73a34ac254e..13e5140d4db 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1334,24 +1334,8 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 		memcpy(&args[5], values, sizeof(values[0]) * 4);
 }
 
-/* Load from output pointers and initialize arguments for the shader export intrinsic */
-static void si_llvm_init_export_args_load(struct lp_build_tgsi_context *bld_base,
-					  LLVMValueRef *out_ptr,
-					  unsigned target,
-					  LLVMValueRef *args)
-{
-	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	LLVMValueRef values[4];
-	int i;
-
-	for (i = 0; i < 4; i++)
-		values[i] = LLVMBuildLoad(gallivm->builder, out_ptr[i], "");
-
-	si_llvm_init_export_args(bld_base, values, target, args);
-}
-
 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
-			  LLVMValueRef alpha_ptr)
+			  LLVMValueRef alpha)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1363,8 +1347,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 		LLVMValueRef alpha_pass =
 			lp_build_cmp(&bld_base->base,
 				     si_shader_ctx->shader->key.ps.alpha_func,
-				     LLVMBuildLoad(gallivm->builder, alpha_ptr, ""),
-				     alpha_ref);
+				     alpha, alpha_ref);
 		LLVMValueRef arg =
 			lp_build_select(&bld_base->base,
 					alpha_pass,
@@ -1383,12 +1366,12 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 	}
 }
 
-static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-					  LLVMValueRef alpha_ptr)
+static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
+						  LLVMValueRef alpha)
 {
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
-	LLVMValueRef coverage, alpha;
+	LLVMValueRef coverage;
 
 	/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
 	coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -1406,9 +1389,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base
 				 lp_build_const_float(gallivm,
 					1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
 
-	alpha = LLVMBuildLoad(gallivm->builder, alpha_ptr, "");
-	alpha = LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
-	LLVMBuildStore(gallivm->builder, alpha, alpha_ptr);
+	return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
 }
 
 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
@@ -2196,8 +2177,8 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 	for (i = 0; i < info->num_outputs; i++) {
 		unsigned semantic_name = info->output_semantic_name[i];
 		unsigned semantic_index = info->output_semantic_index[i];
-		unsigned target;
-		LLVMValueRef alpha_ptr;
+		unsigned target, j;
+		LLVMValueRef color[4] = {};
 
 		/* Select the correct target */
 		switch (semantic_name) {
@@ -2215,29 +2196,24 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 			continue;
 		case TGSI_SEMANTIC_COLOR:
 			target = V_008DFC_SQ_EXP_MRT + semantic_index;
-			alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3];
 
-			if (si_shader_ctx->shader->key.ps.clamp_color) {
-				for (int j = 0; j < 4; j++) {
-					LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j];
-					LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+			for (j = 0; j < 4; j++)
+				color[j] = LLVMBuildLoad(builder,
+							 si_shader_ctx->radeon_bld.soa.outputs[i][j], "");
 
-					result = radeon_llvm_saturate(bld_base, result);
-					LLVMBuildStore(builder, result, ptr);
-				}
-			}
+			if (si_shader_ctx->shader->key.ps.clamp_color)
+				for (j = 0; j < 4; j++)
+					color[j] = radeon_llvm_saturate(bld_base, color[j]);
 
 			if (si_shader_ctx->shader->key.ps.alpha_to_one)
-				LLVMBuildStore(base->gallivm->builder,
-					       base->one, alpha_ptr);
+				color[3] = base->one;
 
 			if (semantic_index == 0 &&
 			    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
-				si_alpha_test(bld_base, alpha_ptr);
+				si_alpha_test(bld_base, color[3]);
 
 			if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-				si_scale_alpha_by_sample_mask(bld_base, alpha_ptr);
-
+				color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
 			break;
 		default:
 			fprintf(stderr,
@@ -2250,18 +2226,15 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		if (semantic_index == 0 &&
 		    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
 			for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
-				si_llvm_init_export_args_load(bld_base,
-							      si_shader_ctx->radeon_bld.soa.outputs[i],
-							      V_008DFC_SQ_EXP_MRT + c, args);
+				si_llvm_init_export_args(bld_base, color,
+							 V_008DFC_SQ_EXP_MRT + c, args);
 				lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
 						   LLVMVoidTypeInContext(base->gallivm->context),
 						   args, 9, 0);
 			}
 		}
 
-		si_llvm_init_export_args_load(bld_base,
-					      si_shader_ctx->radeon_bld.soa.outputs[i],
-					      target, args);
+		si_llvm_init_export_args(bld_base, color, target, args);
 		if (last_color_export == i) {
 			args[1] = uint->one; /* whether the EXEC mask is valid */
 			args[2] = uint->one; /* DONE bit */

From 0ffe3d3772eaa5f14890a2fd2b77b173cb669f3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 19:36:33 +0100
Subject: [PATCH 163/241] radeonsi: use EXP_NULL for pixel shaders without
 outputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pixel shaders without outputs never occur currently.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c        | 2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 13e5140d4db..4204db02128 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2153,7 +2153,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
 		args[1] = uint->one; /* whether the EXEC mask is valid */
 		args[2] = uint->one; /* DONE bit */
-		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
+		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
 		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
 		args[5] = uint->undef; /* R */
 		args[6] = uint->undef; /* G */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 68ba7ec00b4..af21f3e054c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -428,11 +428,8 @@ static void si_shader_ps(struct si_shader *shader)
 	colors_written = info->colors_written;
 	export_16bpc = shader->key.ps.export_16bpc;
 
-	if (!info->num_outputs) {
-		colors_written = 0x1; /* dummy export */
-		export_16bpc = 0;
-	} else if (info->colors_written == 0x1 &&
-		   info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) {
+	if (info->colors_written == 0x1 &&
+	    info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) {
 		colors_written |= (1 << (shader->key.ps.last_cbuf + 1)) - 1;
 	}
 

From a72ed2f6bc3c4e2aa1a317d960d009ff2dda0bc1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 20:02:41 +0100
Subject: [PATCH 164/241] radeonsi: move MRT color exporting into a separate
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will be used by a fragment shader epilog.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 98 ++++++++++++++----------
 1 file changed, 57 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4204db02128..69c5d62f73d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2135,6 +2135,57 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
 			   args, 9, 0);
 }
 
+static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
+				LLVMValueRef *color, unsigned index,
+				bool is_last)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct lp_build_context *base = &bld_base->base;
+	LLVMValueRef args[9];
+	int i;
+
+	/* Clamp color */
+	if (si_shader_ctx->shader->key.ps.clamp_color)
+		for (i = 0; i < 4; i++)
+			color[i] = radeon_llvm_saturate(bld_base, color[i]);
+
+	/* Alpha to one */
+	if (si_shader_ctx->shader->key.ps.alpha_to_one)
+		color[3] = base->one;
+
+	/* Alpha test */
+	if (index == 0 &&
+	    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+		si_alpha_test(bld_base, color[3]);
+
+	/* Line & polygon smoothing */
+	if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
+		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+
+	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+	if (index == 0 &&
+	    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
+		for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
+			si_llvm_init_export_args(bld_base, color,
+						 V_008DFC_SQ_EXP_MRT + c, args);
+			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+					   LLVMVoidTypeInContext(base->gallivm->context),
+					   args, 9, 0);
+		}
+	}
+
+	/* Export */
+	si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
+				 args);
+	if (is_last) {
+		args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
+		args[2] = bld_base->uint_bld.one; /* DONE bit */
+	}
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
+
 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 {
 	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
@@ -2177,7 +2228,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 	for (i = 0; i < info->num_outputs; i++) {
 		unsigned semantic_name = info->output_semantic_name[i];
 		unsigned semantic_index = info->output_semantic_index[i];
-		unsigned target, j;
+		unsigned j;
 		LLVMValueRef color[4] = {};
 
 		/* Select the correct target */
@@ -2185,63 +2236,28 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		case TGSI_SEMANTIC_POSITION:
 			depth = LLVMBuildLoad(builder,
 					      si_shader_ctx->radeon_bld.soa.outputs[i][2], "");
-			continue;
+			break;
 		case TGSI_SEMANTIC_STENCIL:
 			stencil = LLVMBuildLoad(builder,
 						si_shader_ctx->radeon_bld.soa.outputs[i][1], "");
-			continue;
+			break;
 		case TGSI_SEMANTIC_SAMPLEMASK:
 			samplemask = LLVMBuildLoad(builder,
 						   si_shader_ctx->radeon_bld.soa.outputs[i][0], "");
-			continue;
+			break;
 		case TGSI_SEMANTIC_COLOR:
-			target = V_008DFC_SQ_EXP_MRT + semantic_index;
-
 			for (j = 0; j < 4; j++)
 				color[j] = LLVMBuildLoad(builder,
 							 si_shader_ctx->radeon_bld.soa.outputs[i][j], "");
 
-			if (si_shader_ctx->shader->key.ps.clamp_color)
-				for (j = 0; j < 4; j++)
-					color[j] = radeon_llvm_saturate(bld_base, color[j]);
-
-			if (si_shader_ctx->shader->key.ps.alpha_to_one)
-				color[3] = base->one;
-
-			if (semantic_index == 0 &&
-			    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
-				si_alpha_test(bld_base, color[3]);
-
-			if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-				color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+			si_export_mrt_color(bld_base, color, semantic_index,
+					    last_color_export == i);
 			break;
 		default:
 			fprintf(stderr,
 				"Warning: SI unhandled fs output type:%d\n",
 				semantic_name);
-			continue;
 		}
-
-		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
-		if (semantic_index == 0 &&
-		    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
-			for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
-				si_llvm_init_export_args(bld_base, color,
-							 V_008DFC_SQ_EXP_MRT + c, args);
-				lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-						   LLVMVoidTypeInContext(base->gallivm->context),
-						   args, 9, 0);
-			}
-		}
-
-		si_llvm_init_export_args(bld_base, color, target, args);
-		if (last_color_export == i) {
-			args[1] = uint->one; /* whether the EXEC mask is valid */
-			args[2] = uint->one; /* DONE bit */
-		}
-		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-				   LLVMVoidTypeInContext(base->gallivm->context),
-				   args, 9, 0);
 	}
 
 	if (depth || stencil || samplemask)

From 890873d1061973e813969f20f00807a94a7fa2e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 20:05:19 +0100
Subject: [PATCH 165/241] radeonsi: move NULL exporting into a separate
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 37 ++++++++++++++----------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 69c5d62f73d..e40e7c18372 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2186,34 +2186,41 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
 			   args, 9, 0);
 }
 
+static void si_export_null(struct lp_build_tgsi_context *bld_base)
+{
+	struct lp_build_context *base = &bld_base->base;
+	struct lp_build_context *uint = &bld_base->uint_bld;
+	LLVMValueRef args[9];
+
+	args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
+	args[1] = uint->one; /* whether the EXEC mask is valid */
+	args[2] = uint->one; /* DONE bit */
+	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
+	args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+	args[5] = uint->undef; /* R */
+	args[6] = uint->undef; /* G */
+	args[7] = uint->undef; /* B */
+	args[8] = uint->undef; /* A */
+
+	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+			   LLVMVoidTypeInContext(base->gallivm->context),
+			   args, 9, 0);
+}
+
 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 {
 	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
 	struct si_shader * shader = si_shader_ctx->shader;
 	struct lp_build_context * base = &bld_base->base;
-	struct lp_build_context * uint = &bld_base->uint_bld;
 	struct tgsi_shader_info *info = &shader->selector->info;
 	LLVMBuilderRef builder = base->gallivm->builder;
-	LLVMValueRef args[9];
 	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
 	int last_color_export = -1;
 	int i;
 
 	/* If there are no outputs, add a dummy export. */
 	if (!info->num_outputs) {
-		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
-		args[1] = uint->one; /* whether the EXEC mask is valid */
-		args[2] = uint->one; /* DONE bit */
-		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
-		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
-		args[5] = uint->undef; /* R */
-		args[6] = uint->undef; /* G */
-		args[7] = uint->undef; /* B */
-		args[8] = uint->undef; /* A */
-
-		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
-				   LLVMVoidTypeInContext(base->gallivm->context),
-				   args, 9, 0);
+		si_export_null(bld_base);
 		return;
 	}
 

From 20b9b5d7f527ca29f603242dc5355bd2e29c654d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 28 Dec 2015 00:14:05 +0100
Subject: [PATCH 166/241] radeonsi: add struct si_shader_config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There will be 1 config per variant, which will be a union of configs
from {prolog, main, epilog}. For now, just add the structure.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c     | 24 ++++-----
 src/gallium/drivers/radeonsi/si_shader.c      | 31 ++++++------
 src/gallium/drivers/radeonsi/si_shader.h      | 23 +++++----
 src/gallium/drivers/radeonsi/si_state_draw.c  |  4 +-
 .../drivers/radeonsi/si_state_shaders.c       | 50 +++++++++----------
 5 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 1c4d6b3683b..8edf4ad7959 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -68,7 +68,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 		unsigned scratch_bytes_needed;
 
 		si_shader_binary_read_config(&program->shader, offset);
-		scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
+		scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
 		scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
 	}
 
@@ -86,7 +86,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 	 * to the maximum bytes needed, so it can compute the stride
 	 * correctly.
 	 */
-	program->shader.scratch_bytes_per_wave = scratch_bytes;
+	program->shader.config.scratch_bytes_per_wave = scratch_bytes;
 
 	/* Patch the shader with the scratch buffer address. */
 	si_shader_apply_scratch_relocs(sctx,
@@ -281,12 +281,12 @@ static void si_launch_grid(
 
 	memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);
 
-	if (shader->scratch_bytes_per_wave > 0) {
+	if (shader->config.scratch_bytes_per_wave > 0) {
 
 		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
 		            "Total Scratch: %u bytes\n", num_waves_for_scratch,
-			    shader->scratch_bytes_per_wave,
-			    shader->scratch_bytes_per_wave *
+			    shader->config.scratch_bytes_per_wave,
+			    shader->config.scratch_bytes_per_wave *
 			    num_waves_for_scratch);
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
@@ -313,7 +313,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va);
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12,
 		S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64));
+		|  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64));
 
 	si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
 	si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
@@ -361,9 +361,9 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
 
-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->config.rsrc1);
 
-	lds_blocks = shader->lds_size;
+	lds_blocks = shader->config.lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
 	 * blocks of 256 bytes, so if there are 4 bytes lds allocated in
 	 * the shader and 4 bytes allocated by the state tracker, then
@@ -377,10 +377,10 @@ static void si_launch_grid(
 
 	assert(lds_blocks <= 0xFF);
 
-	shader->rsrc2 &= C_00B84C_LDS_SIZE;
-	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+	shader->config.rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->config.rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
 
-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);
 
 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
@@ -402,7 +402,7 @@ static void si_launch_grid(
 		 * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled.
 		 */
 		S_00B860_WAVES(num_waves_for_scratch)
-		| S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10))
+		| S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10))
 		;
 
 	si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index e40e7c18372..a92bedb2f7b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3727,25 +3727,25 @@ void si_shader_binary_read_config(struct si_shader *shader,
 		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
 		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
 		case R_00B848_COMPUTE_PGM_RSRC1:
-			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
-			shader->float_mode =  G_00B028_FLOAT_MODE(value);
-			shader->rsrc1 = value;
+			shader->config.num_sgprs = MAX2(shader->config.num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+			shader->config.num_vgprs = MAX2(shader->config.num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+			shader->config.float_mode =  G_00B028_FLOAT_MODE(value);
+			shader->config.rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+			shader->config.lds_size = MAX2(shader->config.lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
-			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
-			shader->rsrc2 = value;
+			shader->config.lds_size = MAX2(shader->config.lds_size, G_00B84C_LDS_SIZE(value));
+			shader->config.rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
-			shader->spi_ps_input_ena = value;
+			shader->config.spi_ps_input_ena = value;
 			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
-			shader->scratch_bytes_per_wave =
+			shader->config.scratch_bytes_per_wave =
 				G_00B860_WAVESIZE(value) * 256 * 4 * 1;
 			break;
 		default:
@@ -3764,7 +3764,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	uint32_t scratch_rsrc_dword0 = scratch_va;
 	uint32_t scratch_rsrc_dword1 =
 		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+		|  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64);
 
 	for (i = 0 ; i < shader->binary.reloc_count; i++) {
 		const struct radeon_shader_reloc *reloc =
@@ -3866,14 +3866,15 @@ void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 		fprintf(stderr, "*** SHADER STATS ***\n"
 			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
 			"Scratch: %d bytes per wave\n********************\n",
-			shader->num_sgprs, shader->num_vgprs, binary->code_size,
-			shader->lds_size, shader->scratch_bytes_per_wave);
+			shader->config.num_sgprs, shader->config.num_vgprs, binary->code_size,
+			shader->config.lds_size, shader->config.scratch_bytes_per_wave);
 	}
 
 	pipe_debug_message(debug, SHADER_INFO,
 			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
-			   shader->num_sgprs, shader->num_vgprs, binary->code_size,
-			   shader->lds_size, shader->scratch_bytes_per_wave);
+			   shader->config.num_sgprs, shader->config.num_vgprs,
+			   binary->code_size, shader->config.lds_size,
+			   shader->config.scratch_bytes_per_wave);
 }
 
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
@@ -3907,7 +3908,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 	FREE(shader->binary.config);
 	FREE(shader->binary.rodata);
 	FREE(shader->binary.global_symbol_offsets);
-	if (shader->scratch_bytes_per_wave == 0) {
+	if (shader->config.scratch_bytes_per_wave == 0) {
 		FREE(shader->binary.code);
 		FREE(shader->binary.relocs);
 		memset(&shader->binary, 0,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b89d3b29e69..c892ca32803 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -262,6 +262,17 @@ union si_shader_key {
 	} tes; /* tessellation evaluation shader */
 };
 
+struct si_shader_config {
+	unsigned			num_sgprs;
+	unsigned			num_vgprs;
+	unsigned			lds_size;
+	unsigned			spi_ps_input_ena;
+	unsigned			float_mode;
+	unsigned			scratch_bytes_per_wave;
+	unsigned			rsrc1;
+	unsigned			rsrc2;
+};
+
 struct si_shader {
 	struct si_shader_selector	*selector;
 	struct si_shader		*next_variant;
@@ -270,14 +281,9 @@ struct si_shader {
 	struct si_pm4_state		*pm4;
 	struct r600_resource		*bo;
 	struct r600_resource		*scratch_bo;
-	struct radeon_shader_binary	binary;
-	unsigned			num_sgprs;
-	unsigned			num_vgprs;
-	unsigned			lds_size;
-	unsigned			spi_ps_input_ena;
-	unsigned			float_mode;
-	unsigned			scratch_bytes_per_wave;
 	union si_shader_key		key;
+	struct radeon_shader_binary	binary;
+	struct si_shader_config		config;
 
 	unsigned		nparam;
 	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
@@ -288,9 +294,6 @@ struct si_shader {
 	unsigned		nr_param_exports;
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */
-
-	unsigned		rsrc1;
-	unsigned		rsrc2;
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 87a5afbbc97..91ccd073267 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->rsrc2;
+	ls_rsrc2 = ls->current->config.rsrc2;
 
 	if (sctx->b.chip_class >= CIK) {
 		assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
 		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
 	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->rsrc1);
+	radeon_emit(cs, ls->current->config.rsrc1);
 	radeon_emit(cs, ls_rsrc2);
 
 	/* Compute userdata SGPRs. */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index af21f3e054c..64adf699604 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -111,7 +111,7 @@ static void si_shader_ls(struct si_shader *shader)
 	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
 
 	num_user_sgprs = SI_LS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	if (num_user_sgprs > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
 		num_sgprs = num_user_sgprs + 2;
@@ -121,12 +121,12 @@ static void si_shader_ls(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
 
-	shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+	shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
 			   S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
-	shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
-			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
+	shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+			   S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
 
 static void si_shader_hs(struct si_shader *shader)
@@ -143,7 +143,7 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 
 	num_user_sgprs = SI_TCS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with tessellation factor
 	 * buffer offset. */
 	if ((num_user_sgprs + 1) > num_sgprs) {
@@ -155,12 +155,12 @@ static void si_shader_hs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
-		       S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B428_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 		       S_00B42C_USER_SGPR(num_user_sgprs) |
-		       S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_es(struct si_shader *shader)
@@ -187,7 +187,7 @@ static void si_shader_es(struct si_shader *shader)
 	} else
 		unreachable("invalid shader selector type");
 
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
 	if ((num_user_sgprs + 1) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -200,13 +200,13 @@ static void si_shader_es(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
-		       S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B328_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
 		       S_00B32C_USER_SGPR(num_user_sgprs) |
-		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 		si_set_tesseval_regs(shader, pm4);
@@ -272,7 +272,7 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
 
 	num_user_sgprs = SI_GS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */
 	if ((num_user_sgprs + 2) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -281,12 +281,12 @@ static void si_shader_gs(struct si_shader *shader)
 	assert(num_sgprs <= 104);
 
 	si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-		       S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B228_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
 		       S_00B22C_USER_SGPR(num_user_sgprs) |
-		       S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_vs(struct si_shader *shader)
@@ -329,7 +329,7 @@ static void si_shader_vs(struct si_shader *shader)
 	} else
 		unreachable("invalid shader selector type");
 
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	if (num_user_sgprs > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
 		num_sgprs = num_user_sgprs + 2;
@@ -356,7 +356,7 @@ static void si_shader_vs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
-		       S_00B128_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B128_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
@@ -367,7 +367,7 @@ static void si_shader_vs(struct si_shader *shader)
 		       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
 		       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
 		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
-		       S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 	if (window_space)
 		si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
 			       S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
@@ -443,8 +443,8 @@ static void si_shader_ps(struct si_shader *shader)
 	}
 
 	/* Set interpolation controls. */
-	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
-		       G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
+	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
+		       G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
 
 	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
 			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
@@ -468,7 +468,7 @@ static void si_shader_ps(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40);
 
 	num_user_sgprs = SI_PS_NUM_USER_SGPR;
-	num_sgprs = shader->num_sgprs;
+	num_sgprs = shader->config.num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */
 	if ((num_user_sgprs + 1) > num_sgprs) {
 		/* Last 2 reserved SGPRs are used for VCC */
@@ -477,13 +477,13 @@ static void si_shader_ps(struct si_shader *shader)
 	assert(num_sgprs <= 104);
 
 	si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
-		       S_00B028_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
 		       S_00B028_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
-		       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
+		       S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
 		       S_00B02C_USER_SGPR(num_user_sgprs) |
-		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
@@ -1040,7 +1040,7 @@ static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom
 	if (!ps)
 		return;
 
-	input_ena = ps->spi_ps_input_ena;
+	input_ena = ps->config.spi_ps_input_ena;
 
 	/* we need to enable at least one of them, otherwise we hang the GPU */
 	assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
@@ -1269,7 +1269,7 @@ static int si_update_scratch_buffer(struct si_context *sctx,
 		return 0;
 
 	/* This shader doesn't need a scratch buffer */
-	if (shader->scratch_bytes_per_wave == 0)
+	if (shader->config.scratch_bytes_per_wave == 0)
 		return 0;
 
 	/* This shader is already configured to use the current
@@ -1301,7 +1301,7 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
 
 static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
 {
-	return shader ? shader->scratch_bytes_per_wave : 0;
+	return shader ? shader->config.scratch_bytes_per_wave : 0;
 }
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)

From 2d3a96448a6ce28a9955ef7b4d5c62228703e4a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 28 Dec 2015 01:45:00 +0100
Subject: [PATCH 167/241] radeonsi: don't pass si_shader to
 si_shader_binary_read_config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  5 ++--
 src/gallium/drivers/radeonsi/si_shader.c  | 28 +++++++++++------------
 src/gallium/drivers/radeonsi/si_shader.h  |  3 ++-
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 8edf4ad7959..7aedd39115a 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -67,7 +67,8 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 				program->shader.binary.global_symbol_offsets[i];
 		unsigned scratch_bytes_needed;
 
-		si_shader_binary_read_config(&program->shader, offset);
+		si_shader_binary_read_config(&program->shader.binary,
+					     &program->shader.config, offset);
 		scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
 		scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
 	}
@@ -260,7 +261,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
 	/* Read the config information */
-	si_shader_binary_read_config(shader, pc);
+	si_shader_binary_read_config(&shader->binary, &shader->config, pc);
 #endif
 
 	/* Upload the kernel arguments */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a92bedb2f7b..ac1d3e36866 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3707,19 +3707,19 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
-void si_shader_binary_read_config(struct si_shader *shader,
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+				  struct si_shader_config *conf,
 				  unsigned symbol_offset)
 {
 	unsigned i;
 	const unsigned char *config =
-		radeon_shader_binary_config_start(&shader->binary,
-						symbol_offset);
+		radeon_shader_binary_config_start(binary, symbol_offset);
 
 	/* XXX: We may be able to emit some of these values directly rather than
 	 * extracting fields to be emitted later.
 	 */
 
-	for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) {
+	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
 		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
 		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
 		switch (reg) {
@@ -3727,25 +3727,25 @@ void si_shader_binary_read_config(struct si_shader *shader,
 		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
 		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
 		case R_00B848_COMPUTE_PGM_RSRC1:
-			shader->config.num_sgprs = MAX2(shader->config.num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-			shader->config.num_vgprs = MAX2(shader->config.num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
-			shader->config.float_mode =  G_00B028_FLOAT_MODE(value);
-			shader->config.rsrc1 = value;
+			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+			conf->float_mode =  G_00B028_FLOAT_MODE(value);
+			conf->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-			shader->config.lds_size = MAX2(shader->config.lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
-			shader->config.lds_size = MAX2(shader->config.lds_size, G_00B84C_LDS_SIZE(value));
-			shader->config.rsrc2 = value;
+			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
+			conf->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
-			shader->config.spi_ps_input_ena = value;
+			conf->spi_ps_input_ena = value;
 			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
-			shader->config.scratch_bytes_per_wave =
+			conf->scratch_bytes_per_wave =
 				G_00B860_WAVESIZE(value) * 256 * 4 * 1;
 			break;
 		default:
@@ -3857,7 +3857,7 @@ void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 {
 	const struct radeon_shader_binary *binary = &shader->binary;
 
-	si_shader_binary_read_config(shader, 0);
+	si_shader_binary_read_config(&shader->binary, &shader->config, 0);
 
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index c892ca32803..93d5af6ff5e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -344,7 +344,8 @@ void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);
-void si_shader_binary_read_config(struct si_shader *shader,
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+				  struct si_shader_config *conf,
 				  unsigned symbol_offset);
 
 #endif

From 63345cfc3a8a0f2d9bb16deef2a24cebe9045642 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 28 Dec 2015 01:45:00 +0100
Subject: [PATCH 168/241] radeonsi: don't pass si_shader to
 si_shader_binary_read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  3 ++-
 src/gallium/drivers/radeonsi/si_shader.c  | 23 ++++++++++++-----------
 src/gallium/drivers/radeonsi/si_shader.h  |  7 +++++--
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 7aedd39115a..a543c55221b 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -136,7 +136,8 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader, &sctx->b.debug,
+	si_shader_binary_read(sctx->screen, &program->shader.binary,
+			      &program->shader.config, &sctx->b.debug,
 			      TGSI_PROCESSOR_COMPUTE);
 	si_shader_binary_upload(sctx->screen, &program->shader);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ac1d3e36866..9f6f3e1f4a6 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3852,12 +3852,13 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 	}
 }
 
-void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-			   struct pipe_debug_callback *debug, unsigned processor)
+void si_shader_binary_read(struct si_screen *sscreen,
+			   struct radeon_shader_binary *binary,
+			   struct si_shader_config *conf,
+			   struct pipe_debug_callback *debug,
+			   unsigned processor)
 {
-	const struct radeon_shader_binary *binary = &shader->binary;
-
-	si_shader_binary_read_config(&shader->binary, &shader->config, 0);
+	si_shader_binary_read_config(binary, conf, 0);
 
 	if (r600_can_dump_shader(&sscreen->b, processor)) {
 		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
@@ -3866,15 +3867,14 @@ void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 		fprintf(stderr, "*** SHADER STATS ***\n"
 			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
 			"Scratch: %d bytes per wave\n********************\n",
-			shader->config.num_sgprs, shader->config.num_vgprs, binary->code_size,
-			shader->config.lds_size, shader->config.scratch_bytes_per_wave);
+			conf->num_sgprs, conf->num_vgprs, binary->code_size,
+			conf->lds_size, conf->scratch_bytes_per_wave);
 	}
 
 	pipe_debug_message(debug, SHADER_INFO,
 			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
-			   shader->config.num_sgprs, shader->config.num_vgprs,
-			   binary->code_size, shader->config.lds_size,
-			   shader->config.scratch_bytes_per_wave);
+			   conf->num_sgprs, conf->num_vgprs, binary->code_size,
+			   conf->lds_size, conf->scratch_bytes_per_wave);
 }
 
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
@@ -3899,7 +3899,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 			return r;
 	}
 
-	si_shader_binary_read(sscreen, shader, debug, processor);
+	si_shader_binary_read(sscreen, &shader->binary, &shader->config,
+			      debug, processor);
 
 	r = si_shader_binary_upload(sscreen, shader);
 	if (r)
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 93d5af6ff5e..b0abacc1599 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -339,8 +339,11 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-			   struct pipe_debug_callback *debug, unsigned processor);
+void si_shader_binary_read(struct si_screen *sscreen,
+			   struct radeon_shader_binary *binary,
+			   struct si_shader_config *conf,
+			   struct pipe_debug_callback *debug,
+			   unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);

From f20a76a4fd7eb176f2f3c09b7596a05be1961b75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 23:35:08 +0100
Subject: [PATCH 169/241] radeonsi: always keep shader code, rodata, and relocs
 in memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We won't compile shaders in draw calls, but we will concatenate shader
binaries according to states in draw calls, so keep the binaries.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9f6f3e1f4a6..bcc9f658a7b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3907,14 +3907,9 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		return r;
 
 	FREE(shader->binary.config);
-	FREE(shader->binary.rodata);
 	FREE(shader->binary.global_symbol_offsets);
-	if (shader->config.scratch_bytes_per_wave == 0) {
-		FREE(shader->binary.code);
-		FREE(shader->binary.relocs);
-		memset(&shader->binary, 0,
-		       offsetof(struct radeon_shader_binary, disasm_string));
-	}
+	shader->binary.config = NULL;
+	shader->binary.global_symbol_offsets = NULL;
 	return r;
 }
 
@@ -4227,6 +4222,7 @@ void si_shader_destroy(struct si_shader *shader)
 	r600_resource_reference(&shader->bo, NULL);
 
 	FREE(shader->binary.code);
+	FREE(shader->binary.rodata);
 	FREE(shader->binary.relocs);
 	FREE(shader->binary.disasm_string);
 }

From 54ed83669e637ebfe269fc88d2e3924401b2a827 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Dec 2015 23:47:00 +0100
Subject: [PATCH 170/241] radeonsi: move si_shader_binary_upload out of
 si_compile_llvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  1 +
 src/gallium/drivers/radeonsi/si_shader.c  | 12 ++++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index a543c55221b..aedea8e43ee 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -124,6 +124,7 @@ static void *si_create_compute_state(
                                                         code, header->num_bytes);
 			si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm,
 					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+			si_shader_binary_upload(sctx->screen, &program->kernels[i]);
 			LLVMDisposeModule(mod);
 		}
 	}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index bcc9f658a7b..2a206dd6290 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3902,10 +3902,6 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 	si_shader_binary_read(sscreen, &shader->binary, &shader->config,
 			      debug, processor);
 
-	r = si_shader_binary_upload(sscreen, shader);
-	if (r)
-		return r;
-
 	FREE(shader->binary.config);
 	FREE(shader->binary.global_symbol_offsets);
 	shader->binary.config = NULL;
@@ -3986,6 +3982,8 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	r = si_compile_llvm(sscreen, si_shader_ctx->shader,
 			    si_shader_ctx->tm, bld_base->base.gallivm->module,
 			    debug, TGSI_PROCESSOR_GEOMETRY);
+	if (!r)
+		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
 
 	radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
 
@@ -4186,6 +4184,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		goto out;
 	}
 
+	r = si_shader_binary_upload(sscreen, shader);
+	if (r) {
+		fprintf(stderr, "LLVM failed to upload shader\n");
+		goto out;
+	}
+
 	radeon_llvm_dispose(&si_shader_ctx.radeon_bld);
 
 	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {

From 5c9f104567c3b072aa103902ee5868a08ded4835 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 28 Dec 2015 01:45:00 +0100
Subject: [PATCH 171/241] radeonsi: don't pass si_shader to si_compile_llvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  3 ++-
 src/gallium/drivers/radeonsi/si_shader.c  | 33 +++++++++++++----------
 src/gallium/drivers/radeonsi/si_shader.h  | 10 ++++---
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index aedea8e43ee..3562bd846e7 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -122,7 +122,8 @@ static void *si_create_compute_state(
 	        for (i = 0; i < program->num_kernels; i++) {
 		        LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
                                                         code, header->num_bytes);
-			si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm,
+			si_compile_llvm(sctx->screen, &program->kernels[i].binary,
+					&program->kernels[i].config, sctx->tm,
 					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
 			si_shader_binary_upload(sctx->screen, &program->kernels[i]);
 			LLVMDisposeModule(mod);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 2a206dd6290..49882248300 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3877,9 +3877,13 @@ void si_shader_binary_read(struct si_screen *sscreen,
 			   conf->lds_size, conf->scratch_bytes_per_wave);
 }
 
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod,
-		    struct pipe_debug_callback *debug, unsigned processor)
+int si_compile_llvm(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    LLVMTargetMachineRef tm,
+		    LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor)
 {
 	int r = 0;
 	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
@@ -3891,21 +3895,20 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 			LLVMDumpModule(mod);
 	}
 
-	if (!si_replace_shader(count, &shader->binary)) {
-		r = radeon_llvm_compile(mod, &shader->binary,
+	if (!si_replace_shader(count, binary)) {
+		r = radeon_llvm_compile(mod, binary,
 			r600_get_llvm_processor_name(sscreen->b.family), tm,
 			debug);
 		if (r)
 			return r;
 	}
 
-	si_shader_binary_read(sscreen, &shader->binary, &shader->config,
-			      debug, processor);
+	si_shader_binary_read(sscreen, binary, conf, debug, processor);
 
-	FREE(shader->binary.config);
-	FREE(shader->binary.global_symbol_offsets);
-	shader->binary.config = NULL;
-	shader->binary.global_symbol_offsets = NULL;
+	FREE(binary->config);
+	FREE(binary->global_symbol_offsets);
+	binary->config = NULL;
+	binary->global_symbol_offsets = NULL;
 	return r;
 }
 
@@ -3979,8 +3982,9 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	if (dump)
 		fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
 
-	r = si_compile_llvm(sscreen, si_shader_ctx->shader,
-			    si_shader_ctx->tm, bld_base->base.gallivm->module,
+	r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary,
+			    &si_shader_ctx->shader->config, si_shader_ctx->tm,
+			    bld_base->base.gallivm->module,
 			    debug, TGSI_PROCESSOR_GEOMETRY);
 	if (!r)
 		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
@@ -4178,7 +4182,8 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
 
 	mod = bld_base->base.gallivm->module;
-	r = si_compile_llvm(sscreen, shader, tm, mod, debug, si_shader_ctx.type);
+	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
+			    mod, debug, si_shader_ctx.type);
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
 		goto out;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b0abacc1599..2220fc7b91b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -333,9 +333,13 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		     struct si_shader *shader,
 		     struct pipe_debug_callback *debug);
 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMTargetMachineRef tm, LLVMModuleRef mod,
-		    struct pipe_debug_callback *debug, unsigned processor);
+int si_compile_llvm(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    LLVMTargetMachineRef tm,
+		    LLVMModuleRef mod,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor);
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);

From ccd7d7e13d708e6f4128bc09b42dd88bd606d49b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 28 Dec 2015 00:53:29 +0100
Subject: [PATCH 172/241] radeonsi: add si_shader_destroy_binary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 14 +++++++++-----
 src/gallium/drivers/radeonsi/si_shader.h |  1 +
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 49882248300..af32bd73d03 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4218,6 +4218,14 @@ out:
 	return r;
 }
 
+void si_shader_destroy_binary(struct radeon_shader_binary *binary)
+{
+	FREE(binary->code);
+	FREE(binary->rodata);
+	FREE(binary->relocs);
+	FREE(binary->disasm_string);
+}
+
 void si_shader_destroy(struct si_shader *shader)
 {
 	if (shader->gs_copy_shader) {
@@ -4229,9 +4237,5 @@ void si_shader_destroy(struct si_shader *shader)
 		r600_resource_reference(&shader->scratch_bo, NULL);
 
 	r600_resource_reference(&shader->bo, NULL);
-
-	FREE(shader->binary.code);
-	FREE(shader->binary.rodata);
-	FREE(shader->binary.relocs);
-	FREE(shader->binary.disasm_string);
+	si_shader_destroy_binary(&shader->binary);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 2220fc7b91b..780383c09ee 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -341,6 +341,7 @@ int si_compile_llvm(struct si_screen *sscreen,
 		    struct pipe_debug_callback *debug,
 		    unsigned processor);
 void si_shader_destroy(struct si_shader *shader);
+void si_shader_destroy_binary(struct radeon_shader_binary *binary);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
 void si_shader_binary_read(struct si_screen *sscreen,

From f8b34fe093594ad7c0428c55efae2f479b543a62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 3 Jan 2016 16:39:24 +0100
Subject: [PATCH 173/241] radeonsi: separate shader dumping code to
 si_shader_dump and *_dump_stats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eventually, I'd like to dump stats for several combined binaries, which is
why you don't see a binary parameter in si_shader_dump_stats.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 50 ++++++++++++++++--------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index af32bd73d03..ec34bec3a79 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3852,6 +3852,39 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 	}
 }
 
+static void si_shader_dump_stats(struct si_screen *sscreen,
+			         struct si_shader_config *conf,
+				 unsigned code_size,
+			         struct pipe_debug_callback *debug,
+			         unsigned processor)
+{
+	if (r600_can_dump_shader(&sscreen->b, processor)) {
+		fprintf(stderr, "*** SHADER STATS ***\n"
+			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
+			"Scratch: %d bytes per wave\n********************\n",
+			conf->num_sgprs, conf->num_vgprs, code_size,
+			conf->lds_size, conf->scratch_bytes_per_wave);
+	}
+
+	pipe_debug_message(debug, SHADER_INFO,
+			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
+			   conf->num_sgprs, conf->num_vgprs, code_size,
+			   conf->lds_size, conf->scratch_bytes_per_wave);
+}
+
+static void si_shader_dump(struct si_screen *sscreen,
+			   struct radeon_shader_binary *binary,
+			   struct si_shader_config *conf,
+			   struct pipe_debug_callback *debug,
+			   unsigned processor)
+{
+	if (r600_can_dump_shader(&sscreen->b, processor))
+		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
+			si_shader_dump_disassembly(binary, debug);
+
+	si_shader_dump_stats(sscreen, conf, binary->code_size, debug, processor);
+}
+
 void si_shader_binary_read(struct si_screen *sscreen,
 			   struct radeon_shader_binary *binary,
 			   struct si_shader_config *conf,
@@ -3859,22 +3892,7 @@ void si_shader_binary_read(struct si_screen *sscreen,
 			   unsigned processor)
 {
 	si_shader_binary_read_config(binary, conf, 0);
-
-	if (r600_can_dump_shader(&sscreen->b, processor)) {
-		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
-			si_shader_dump_disassembly(binary, debug);
-
-		fprintf(stderr, "*** SHADER STATS ***\n"
-			"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
-			"Scratch: %d bytes per wave\n********************\n",
-			conf->num_sgprs, conf->num_vgprs, binary->code_size,
-			conf->lds_size, conf->scratch_bytes_per_wave);
-	}
-
-	pipe_debug_message(debug, SHADER_INFO,
-			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
-			   conf->num_sgprs, conf->num_vgprs, binary->code_size,
-			   conf->lds_size, conf->scratch_bytes_per_wave);
+	si_shader_dump(sscreen, binary, conf, debug, processor);
 }
 
 int si_compile_llvm(struct si_screen *sscreen,

From c9c031f3d0ed28a65b78748ed1e6e2ec29336451 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 3 Jan 2016 17:03:24 +0100
Subject: [PATCH 174/241] radeonsi: move si_shader_dump call out of
 si_shader_binary_read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  8 +++++---
 src/gallium/drivers/radeonsi/si_shader.c  | 21 +++++++++------------
 src/gallium/drivers/radeonsi/si_shader.h  | 12 +++++++-----
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 3562bd846e7..ffa941b5f60 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -138,9 +138,11 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader.binary,
-			      &program->shader.config, &sctx->b.debug,
-			      TGSI_PROCESSOR_COMPUTE);
+	si_shader_binary_read(&program->shader.binary,
+			      &program->shader.config);
+	si_shader_dump(sctx->screen, &program->shader.binary,
+		       &program->shader.config, &sctx->b.debug,
+		       TGSI_PROCESSOR_COMPUTE);
 	si_shader_binary_upload(sctx->screen, &program->shader);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ec34bec3a79..ccb179c1f67 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3872,11 +3872,11 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
 			   conf->lds_size, conf->scratch_bytes_per_wave);
 }
 
-static void si_shader_dump(struct si_screen *sscreen,
-			   struct radeon_shader_binary *binary,
-			   struct si_shader_config *conf,
-			   struct pipe_debug_callback *debug,
-			   unsigned processor)
+void si_shader_dump(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor)
 {
 	if (r600_can_dump_shader(&sscreen->b, processor))
 		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
@@ -3885,14 +3885,10 @@ static void si_shader_dump(struct si_screen *sscreen,
 	si_shader_dump_stats(sscreen, conf, binary->code_size, debug, processor);
 }
 
-void si_shader_binary_read(struct si_screen *sscreen,
-			   struct radeon_shader_binary *binary,
-			   struct si_shader_config *conf,
-			   struct pipe_debug_callback *debug,
-			   unsigned processor)
+void si_shader_binary_read(struct radeon_shader_binary *binary,
+			   struct si_shader_config *conf)
 {
 	si_shader_binary_read_config(binary, conf, 0);
-	si_shader_dump(sscreen, binary, conf, debug, processor);
 }
 
 int si_compile_llvm(struct si_screen *sscreen,
@@ -3921,7 +3917,8 @@ int si_compile_llvm(struct si_screen *sscreen,
 			return r;
 	}
 
-	si_shader_binary_read(sscreen, binary, conf, debug, processor);
+	si_shader_binary_read(binary, conf);
+	si_shader_dump(sscreen, binary, conf, debug, processor);
 
 	FREE(binary->config);
 	FREE(binary->global_symbol_offsets);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 780383c09ee..51dfcd09e9e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -344,11 +344,13 @@ void si_shader_destroy(struct si_shader *shader);
 void si_shader_destroy_binary(struct radeon_shader_binary *binary);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-void si_shader_binary_read(struct si_screen *sscreen,
-			   struct radeon_shader_binary *binary,
-			   struct si_shader_config *conf,
-			   struct pipe_debug_callback *debug,
-			   unsigned processor);
+void si_shader_binary_read(struct radeon_shader_binary *binary,
+			   struct si_shader_config *conf);
+void si_shader_dump(struct si_screen *sscreen,
+		    struct radeon_shader_binary *binary,
+		    struct si_shader_config *conf,
+		    struct pipe_debug_callback *debug,
+		    unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);

From b0df5f4c19f2c084fe65b13f5712433c91ad83d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 3 Jan 2016 17:05:05 +0100
Subject: [PATCH 175/241] radeonsi: inline si_shader_binary_read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 4 ++--
 src/gallium/drivers/radeonsi/si_shader.c  | 8 +-------
 src/gallium/drivers/radeonsi/si_shader.h  | 2 --
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index ffa941b5f60..2380242a7fa 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -138,8 +138,8 @@ static void *si_create_compute_state(
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(&program->shader.binary,
-			      &program->shader.config);
+	si_shader_binary_read_config(&program->shader.binary,
+				     &program->shader.config, 0);
 	si_shader_dump(sctx->screen, &program->shader.binary,
 		       &program->shader.config, &sctx->b.debug,
 		       TGSI_PROCESSOR_COMPUTE);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ccb179c1f67..36b0364a204 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3885,12 +3885,6 @@ void si_shader_dump(struct si_screen *sscreen,
 	si_shader_dump_stats(sscreen, conf, binary->code_size, debug, processor);
 }
 
-void si_shader_binary_read(struct radeon_shader_binary *binary,
-			   struct si_shader_config *conf)
-{
-	si_shader_binary_read_config(binary, conf, 0);
-}
-
 int si_compile_llvm(struct si_screen *sscreen,
 		    struct radeon_shader_binary *binary,
 		    struct si_shader_config *conf,
@@ -3917,7 +3911,7 @@ int si_compile_llvm(struct si_screen *sscreen,
 			return r;
 	}
 
-	si_shader_binary_read(binary, conf);
+	si_shader_binary_read_config(binary, conf, 0);
 	si_shader_dump(sscreen, binary, conf, debug, processor);
 
 	FREE(binary->config);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 51dfcd09e9e..712bcd9075d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -344,8 +344,6 @@ void si_shader_destroy(struct si_shader *shader);
 void si_shader_destroy_binary(struct radeon_shader_binary *binary);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-void si_shader_binary_read(struct radeon_shader_binary *binary,
-			   struct si_shader_config *conf);
 void si_shader_dump(struct si_screen *sscreen,
 		    struct radeon_shader_binary *binary,
 		    struct si_shader_config *conf,

From 0a51b010e5a250bd533c95447f6f2d8132345e67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 3 Jan 2016 17:18:04 +0100
Subject: [PATCH 176/241] radeonsi: move si_shader_dump call out of
 si_compile_llvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  3 +++
 src/gallium/drivers/radeonsi/si_shader.c  | 10 ++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 2380242a7fa..ffac6560527 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -125,6 +125,9 @@ static void *si_create_compute_state(
 			si_compile_llvm(sctx->screen, &program->kernels[i].binary,
 					&program->kernels[i].config, sctx->tm,
 					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+			si_shader_dump(sctx->screen, &program->kernels[i].binary,
+				       &program->kernels[i].config,
+				       &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
 			si_shader_binary_upload(sctx->screen, &program->kernels[i]);
 			LLVMDisposeModule(mod);
 		}
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 36b0364a204..500dd2b60c5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3912,7 +3912,6 @@ int si_compile_llvm(struct si_screen *sscreen,
 	}
 
 	si_shader_binary_read_config(binary, conf, 0);
-	si_shader_dump(sscreen, binary, conf, debug, processor);
 
 	FREE(binary->config);
 	FREE(binary->global_symbol_offsets);
@@ -3995,8 +3994,12 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 			    &si_shader_ctx->shader->config, si_shader_ctx->tm,
 			    bld_base->base.gallivm->module,
 			    debug, TGSI_PROCESSOR_GEOMETRY);
-	if (!r)
+	if (!r) {
+		si_shader_dump(sscreen, &si_shader_ctx->shader->binary,
+			       &si_shader_ctx->shader->config, debug,
+			       TGSI_PROCESSOR_GEOMETRY);
 		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
+	}
 
 	radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
 
@@ -4198,6 +4201,9 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		goto out;
 	}
 
+	si_shader_dump(sscreen, &shader->binary, &shader->config,
+		       debug, si_shader_ctx.type);
+
 	r = si_shader_binary_upload(sscreen, shader);
 	if (r) {
 		fprintf(stderr, "LLVM failed to upload shader\n");

From bca18057a359f98b5db0a6453abe4dc7dd70a31d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 6 Jan 2016 02:30:13 +0100
Subject: [PATCH 177/241] radeonsi: adjust the parameters of si_shader_dump
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function will be extended to dump all binaries that shaders will consist
of, so si_shader* makes sense here.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c |  6 ++----
 src/gallium/drivers/radeonsi/si_shader.c  | 18 +++++++-----------
 src/gallium/drivers/radeonsi/si_shader.h  |  7 ++-----
 3 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index ffac6560527..5a08cbfb198 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -125,8 +125,7 @@ static void *si_create_compute_state(
 			si_compile_llvm(sctx->screen, &program->kernels[i].binary,
 					&program->kernels[i].config, sctx->tm,
 					mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
-			si_shader_dump(sctx->screen, &program->kernels[i].binary,
-				       &program->kernels[i].config,
+			si_shader_dump(sctx->screen, &program->kernels[i],
 				       &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
 			si_shader_binary_upload(sctx->screen, &program->kernels[i]);
 			LLVMDisposeModule(mod);
@@ -143,8 +142,7 @@ static void *si_create_compute_state(
 	init_scratch_buffer(sctx, program);
 	si_shader_binary_read_config(&program->shader.binary,
 				     &program->shader.config, 0);
-	si_shader_dump(sctx->screen, &program->shader.binary,
-		       &program->shader.config, &sctx->b.debug,
+	si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
 		       TGSI_PROCESSOR_COMPUTE);
 	si_shader_binary_upload(sctx->screen, &program->shader);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 500dd2b60c5..97645315049 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3872,17 +3872,15 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
 			   conf->lds_size, conf->scratch_bytes_per_wave);
 }
 
-void si_shader_dump(struct si_screen *sscreen,
-		    struct radeon_shader_binary *binary,
-		    struct si_shader_config *conf,
-		    struct pipe_debug_callback *debug,
-		    unsigned processor)
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+		    struct pipe_debug_callback *debug, unsigned processor)
 {
 	if (r600_can_dump_shader(&sscreen->b, processor))
 		if (!(sscreen->b.debug_flags & DBG_NO_ASM))
-			si_shader_dump_disassembly(binary, debug);
+			si_shader_dump_disassembly(&shader->binary, debug);
 
-	si_shader_dump_stats(sscreen, conf, binary->code_size, debug, processor);
+	si_shader_dump_stats(sscreen, &shader->config,
+			     shader->binary.code_size, debug, processor);
 }
 
 int si_compile_llvm(struct si_screen *sscreen,
@@ -3995,8 +3993,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 			    bld_base->base.gallivm->module,
 			    debug, TGSI_PROCESSOR_GEOMETRY);
 	if (!r) {
-		si_shader_dump(sscreen, &si_shader_ctx->shader->binary,
-			       &si_shader_ctx->shader->config, debug,
+		si_shader_dump(sscreen, si_shader_ctx->shader, debug,
 			       TGSI_PROCESSOR_GEOMETRY);
 		r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
 	}
@@ -4201,8 +4198,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		goto out;
 	}
 
-	si_shader_dump(sscreen, &shader->binary, &shader->config,
-		       debug, si_shader_ctx.type);
+	si_shader_dump(sscreen, shader, debug, si_shader_ctx.type);
 
 	r = si_shader_binary_upload(sscreen, shader);
 	if (r) {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 712bcd9075d..1635358d505 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -344,11 +344,8 @@ void si_shader_destroy(struct si_shader *shader);
 void si_shader_destroy_binary(struct radeon_shader_binary *binary);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-void si_shader_dump(struct si_screen *sscreen,
-		    struct radeon_shader_binary *binary,
-		    struct si_shader_config *conf,
-		    struct pipe_debug_callback *debug,
-		    unsigned processor);
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+		    struct pipe_debug_callback *debug, unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);

From f41b6cfb07ede2be053c57e38d4d6b9433f90bf1 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 7 Jan 2016 19:50:12 +0200
Subject: [PATCH 178/241] llvmpipe: use sse2 conv code for altivec

In lp_build_conv() and lp_build_conv_auto(), there is a special case of
conversion when sse2 is present. That code path is suitable without any
changes to altivec, because all the functions that are called in that
code path already support altivec.

This patch increases the FPS on the POWER architecture across the board,
by between 10% and 25%.

I checked ipers, glxgears, glxspheres64, openarena, xonotic and glmark2.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 14244470c90..7854142f736 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -458,7 +458,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
    {
       /* Special case 4x4f --> 1x16ub */
       if (src_type.length == 4 &&
-          util_cpu_caps.has_sse2)
+            (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
       {
          num_dsts = (num_srcs + 3) / 4;
          dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
@@ -545,7 +545,7 @@ lp_build_conv(struct gallivm_state *gallivm,
        ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
         (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
 
-       util_cpu_caps.has_sse2)
+       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
    {
       struct lp_build_context bld;
       struct lp_type int16_type, int32_type;

From 6aed083b9304cd718ee5bc7839a6222b982d3e3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 5 Jan 2016 21:47:04 -0500
Subject: [PATCH 179/241] mesa/bufferobj: make _mesa_delete_buffer_object
 externally accessible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gl_buffer_object has grown more complicated and requires cleanup. Using this
function from drivers will be more future-proof.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/bufferobj.c | 2 +-
 src/mesa/main/bufferobj.h | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 3a05cd55042..a1e47d62773 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -447,7 +447,7 @@ _mesa_new_buffer_object(struct gl_context *ctx, GLuint name)
  *
  * Default callback for the \c dd_function_table::DeleteBuffer() hook.
  */
-static void
+void
 _mesa_delete_buffer_object(struct gl_context *ctx,
                            struct gl_buffer_object *bufObj)
 {
diff --git a/src/mesa/main/bufferobj.h b/src/mesa/main/bufferobj.h
index 3eac96df23e..a5bfe886b39 100644
--- a/src/mesa/main/bufferobj.h
+++ b/src/mesa/main/bufferobj.h
@@ -108,6 +108,10 @@ _mesa_initialize_buffer_object(struct gl_context *ctx,
                                struct gl_buffer_object *obj,
                                GLuint name);
 
+extern void
+_mesa_delete_buffer_object(struct gl_context *ctx,
+                           struct gl_buffer_object *bufObj);
+
 extern void
 _mesa_reference_buffer_object_(struct gl_context *ctx,
                                struct gl_buffer_object **ptr,

From 1c2187b1c225b2f7e1891544d184bde60390977e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 5 Jan 2016 21:49:11 -0500
Subject: [PATCH 180/241] st/mesa: use _mesa_delete_buffer_object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is more future-proof than the current code.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/state_tracker/st_cb_bufferobjects.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index 5d20b26d26e..c75f4765b94 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -83,9 +83,7 @@ st_bufferobj_free(struct gl_context *ctx, struct gl_buffer_object *obj)
    if (st_obj->buffer)
       pipe_resource_reference(&st_obj->buffer, NULL);
 
-   mtx_destroy(&st_obj->Base.Mutex);
-   free(st_obj->Base.Label);
-   free(st_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 

From 8882b46226152733960ae006e3856baf00aa71f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 5 Jan 2016 21:49:37 -0500
Subject: [PATCH 181/241] radeon: use _mesa_delete_buffer_object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is more future-proof, plugs the memory leak of Label and properly
destroys the buffer mutex.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/drivers/dri/radeon/radeon_buffer_objects.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
index d9d4f5ffc5e..2b76305dd45 100644
--- a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
+++ b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
@@ -71,7 +71,7 @@ radeonDeleteBufferObject(struct gl_context * ctx,
         radeon_bo_unref(radeon_obj->bo);
     }
 
-    free(radeon_obj);
+    _mesa_delete_buffer_object(ctx, obj);
 }
 
 

From 1b74c02e83c59a51f155b64de0444ea3df183af6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 5 Jan 2016 21:51:13 -0500
Subject: [PATCH 182/241] i915: use _mesa_delete_buffer_object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is more future-proof, plugs the memory leak of Label and properly
destroys the buffer mutex.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/drivers/dri/i915/intel_buffer_objects.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i915/intel_buffer_objects.c b/src/mesa/drivers/dri/i915/intel_buffer_objects.c
index ef06743ed49..e6760964909 100644
--- a/src/mesa/drivers/dri/i915/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i915/intel_buffer_objects.c
@@ -99,7 +99,7 @@ intel_bufferobj_free(struct gl_context * ctx, struct gl_buffer_object *obj)
    _mesa_align_free(intel_obj->sys_buffer);
 
    drm_intel_bo_unreference(intel_obj->buffer);
-   free(intel_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 

From 051603efd546efea9975a5109910171a2e7853a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 5 Jan 2016 21:51:27 -0500
Subject: [PATCH 183/241] i965: use _mesa_delete_buffer_object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is more future-proof, plugs the memory leak of Label and properly
destroys the buffer mutex.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/drivers/dri/i965/intel_buffer_objects.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 7a5b3fca595..56da2da08a8 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -167,7 +167,7 @@ brw_delete_buffer(struct gl_context * ctx, struct gl_buffer_object *obj)
    _mesa_buffer_unmap_all_mappings(ctx, obj);
 
    drm_intel_bo_unreference(intel_obj->buffer);
-   free(intel_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 

From 5cf156c6b42c03391429ef08542363f7574fd0c9 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Tue, 14 Jul 2015 23:30:27 +1000
Subject: [PATCH 184/241] glsl: replace null check with assert

This was added in 54f583a20; since then error handling has improved.

The test this was added to fix now fails earlier since 01822706ec.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/ir_constant_expression.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 5bf5ce54f78..f02e959bd18 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -1824,9 +1824,7 @@ ir_swizzle::constant_expression_value(struct hash_table *variable_context)
 ir_constant *
 ir_dereference_variable::constant_expression_value(struct hash_table *variable_context)
 {
-   /* This may occur during compile and var->type is glsl_type::error_type */
-   if (!var)
-      return NULL;
+   assert(var);
 
    /* Give priority to the context hashtable, if it exists */
    if (variable_context) {

From 64da11f05294551a248f395db168f4941f969b45 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Wed, 6 Jan 2016 23:49:30 +0100
Subject: [PATCH 185/241] draw: fix line stippling with unfilled prims

The unfilled stage was not filling in the prim header, and the line stage
then decided whether to reset the stipple counter based on the uninitialized
data. This causes some failures in the conform linestipple test (albeit
happening quite randomly depending on the environment).
So fill in the prim header in the unfilled stage - I am not entirely sure
if anybody really needs the determinant after that stage, but there are at
least later stages (wide line for instance) which copy over the determinant
as well.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 .../auxiliary/draw/draw_pipe_unfilled.c       | 56 +++++++++++++------
 1 file changed, 38 insertions(+), 18 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index 2517d610e71..c465c7526f5 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -86,27 +86,33 @@ inject_front_face_info(struct draw_stage *stage,
 }
 
    
-static void point( struct draw_stage *stage,
-		   struct vertex_header *v0 )
+static void point(struct draw_stage *stage,
+                  struct prim_header *header,
+                  struct vertex_header *v0)
 {
    struct prim_header tmp;
+   tmp.det = header->det;
+   tmp.flags = 0;
    tmp.v[0] = v0;
-   stage->next->point( stage->next, &tmp );
+   stage->next->point(stage->next, &tmp);
 }
 
-static void line( struct draw_stage *stage,
-		  struct vertex_header *v0,
-		  struct vertex_header *v1 )
+static void line(struct draw_stage *stage,
+                 struct prim_header *header,
+                 struct vertex_header *v0,
+                 struct vertex_header *v1)
 {
    struct prim_header tmp;
+   tmp.det = header->det;
+   tmp.flags = 0;
    tmp.v[0] = v0;
    tmp.v[1] = v1;
-   stage->next->line( stage->next, &tmp );
+   stage->next->line(stage->next, &tmp);
 }
 
 
-static void points( struct draw_stage *stage,
-		    struct prim_header *header )
+static void points(struct draw_stage *stage,
+                   struct prim_header *header)
 {
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
@@ -114,27 +120,41 @@ static void points( struct draw_stage *stage,
 
    inject_front_face_info(stage, header);
 
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) point( stage, v0 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) point( stage, v1 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) point( stage, v2 );
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag)
+      point(stage, header, v0);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag)
+      point(stage, header, v1);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag)
+      point(stage, header, v2);
 }
 
 
-static void lines( struct draw_stage *stage,
-		   struct prim_header *header )
+static void lines(struct draw_stage *stage,
+                  struct prim_header *header)
 {
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
    struct vertex_header *v2 = header->v[2];
 
    if (header->flags & DRAW_PIPE_RESET_STIPPLE)
-      stage->next->reset_stipple_counter( stage->next );
+      /*
+       * XXX could revisit this. The only stage which cares is the line
+       * stipple stage. Could just emit correct reset flags here and not
+       * bother about all the calling through reset_stipple_counter
+       * stages. Though technically it is necessary if line stipple is
+       * handled by the driver, but this is not actually hooked up when
+       * using vbuf (vbuf stage reset_stipple_counter does nothing).
+       */
+      stage->next->reset_stipple_counter(stage->next);
 
    inject_front_face_info(stage, header);
 
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) line( stage, v2, v0 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) line( stage, v0, v1 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) line( stage, v1, v2 );
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag)
+      line(stage, header, v2, v0);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag)
+      line(stage, header, v0, v1);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag)
+      line(stage, header, v1, v2);
 }
 
 

From 9db7309595309ff0835c701108827ffc8c906751 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 7 Jan 2016 19:38:15 +0100
Subject: [PATCH 186/241] draw: initialize prim header flags when clipping
 lines

Otherwise, clipped lines would have an undefined stippling reset bit if line
stippling is enabled.
(Untested, and I just assume copying over the bits from the original line
is actually the right thing to do.)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/draw/draw_pipe_clip.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 67d8ecaa35f..2d92d650ab6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -611,6 +611,8 @@ do_clip_line(struct draw_stage *stage,
    struct prim_header newprim;
    int viewport_index;
 
+   newprim.flags = header->flags;
+
    if (stage->draw->rasterizer->flatshade_first) {
       prov_vertex = v0;
    }

From b61b9a377edb566af2f015c159f5f8779d9b27d9 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 31 Dec 2015 03:20:38 +0100
Subject: [PATCH 187/241] llvmpipe: use aligned data for the assembly program
 in setup

Back in the day (before 24678700edaf5bb9da9be93a1367f1a24cfaa471) the values
were not actually in a struct but even then I can't see why we didn't simply
align the values. Especially since it's trivial to do so.
(Not that it actually matters since the code is pretty much unused for now.)

Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 38 ++++++++++++---------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 0ff10a2027d..592412fe346 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -390,6 +390,10 @@ do_triangle_ccw(struct lp_setup_context *setup,
    plane = GET_PLANES(tri);
 
 #if defined(PIPE_ARCH_SSE)
+   /*
+    * XXX this code is effectively disabled for all practical purposes,
+    * as the allowed fb size is tiny if FIXED_ORDER is 8.
+    */
    if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
        setup->fb.height <= MAX_FIXED_LENGTH32 &&
        (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
@@ -407,8 +411,8 @@ do_triangle_ccw(struct lp_setup_context *setup,
       __m128i zero = _mm_setzero_si128();
       PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
 
-      vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */
-      verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */
+      vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */
+      verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */
 
       shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
       shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
@@ -1032,12 +1036,12 @@ rotate_fixed_position_12( struct fixed_position* position )
 /**
  * Draw triangle if it's CW, cull otherwise.
  */
-static void triangle_cw( struct lp_setup_context *setup,
-			 const float (*v0)[4],
-			 const float (*v1)[4],
-			 const float (*v2)[4] )
+static void triangle_cw(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -1053,12 +1057,12 @@ static void triangle_cw( struct lp_setup_context *setup,
 }
 
 
-static void triangle_ccw( struct lp_setup_context *setup,
-                          const float (*v0)[4],
-                          const float (*v1)[4],
-                          const float (*v2)[4])
+static void triangle_ccw(struct lp_setup_context *setup,
+                         const float (*v0)[4],
+                         const float (*v1)[4],
+                         const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -1069,12 +1073,12 @@ static void triangle_ccw( struct lp_setup_context *setup,
 /**
  * Draw triangle whether it's CW or CCW.
  */
-static void triangle_both( struct lp_setup_context *setup,
-			   const float (*v0)[4],
-			   const float (*v1)[4],
-			   const float (*v2)[4] )
+static void triangle_both(struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
    struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
 
    if (lp_context->active_statistics_queries &&

From fad283ba9e691d0d5d170f388e75542f2c39e559 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 2 Jan 2016 04:58:37 +0100
Subject: [PATCH 188/241] llvmpipe: don't store eo as 64bit int

eo, just like dcdx and dcdy, cannot overflow 32 bits.
Store it as unsigned just in case, though: it cannot be negative, but in
theory it can be twice as big as dcdx or dcdy, so storing it unsigned
gives it one more bit. This doesn't really change anything, although it
might help minimally on 32-bit archs.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_rast.h         |  2 +-
 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h |  4 ++--
 src/gallium/drivers/llvmpipe/lp_setup.c        |  5 +++++
 src/gallium/drivers/llvmpipe/lp_setup_tri.c    | 16 ++++++++--------
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c19f9318006..db45cbbb057 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -115,7 +115,7 @@ struct lp_rast_plane {
    int32_t dcdy;
 
    /* one-pixel sized trivial reject offsets for each plane */
-   int64_t eo;
+   uint32_t eo;
 };
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 52f6e999683..e0aea94205e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -82,7 +82,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
       const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
       const int64_t cox = IMUL64(plane[j].eo, 4);
-      const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
       const int64_t cio = IMUL64(ei, 4) - 1;
 
       BUILD_MASKS(c[j] + cox,
@@ -182,7 +182,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
          const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
          const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
          const int64_t cox = IMUL64(plane[j].eo, 16);
-         const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+         const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
          const int64_t cio = IMUL64(ei, 16) - 1;
 
          BUILD_MASKS(c[j] + cox,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index ddbb88eb107..bd850519468 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -486,6 +486,11 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup,
                                    depth,
                                    stencil);
 
+   /*
+    * XXX: should make a full mask here for things like D24X8,
+    * otherwise we'll do a read-modify-write clear later which
+    * should be unnecessary.
+    */
    zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format,
                                        zmask32,
                                        smask8);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 592412fe346..a1631fdaee9 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -621,19 +621,19 @@ do_triangle_ccw(struct lp_setup_context *setup,
    }
 
    if (0) {
-      debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+      debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[0].c,
                    plane[0].dcdx,
                    plane[0].dcdy,
                    plane[0].eo);
-      
-      debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[1].c,
                    plane[1].dcdx,
                    plane[1].dcdy,
                    plane[1].eo);
-      
-      debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[2].c,
                    plane[2].dcdx,
                    plane[2].dcdy,
@@ -694,7 +694,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
 static inline uint32_t 
 floor_pot(uint32_t n)
 {
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
+#if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
    if (n == 0)
       return 0;
 
@@ -842,9 +842,9 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
 
          ei[i] = (plane[i].dcdy - 
                   plane[i].dcdx - 
-                  plane[i].eo) << TILE_ORDER;
+                  (int64_t)plane[i].eo) << TILE_ORDER;
 
-         eo[i] = plane[i].eo << TILE_ORDER;
+         eo[i] = (int64_t)plane[i].eo << TILE_ORDER;
          xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
          ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
       }

From 2923c7a0ed92a29da029183356e81ad55e615cf7 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 2 Jan 2016 04:59:09 +0100
Subject: [PATCH 189/241] llvmpipe: do 64bit plane calculations in the sse path

The sse path was pretty much disabled for practical purposes because the
largest allowed fb size was 128x128. So, adapt it for 64bit plane calculations.
This is actually not that difficult, though a problem is that we can't do
a signed 32x32->64bit mul, only unsigned, so need to fix that up. Overall,
the code still looks reasonable, though it's not like changes there in
setup really make much of a difference in the end...

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/util/u_sse.h           |  90 ++++++++++++++--
 src/gallium/drivers/llvmpipe/lp_setup_line.c |  16 ++-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c  | 102 +++++++++++--------
 3 files changed, 148 insertions(+), 60 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 7f8e5a1a3cf..cae4138ba01 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -166,14 +166,49 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
 #endif /* !PIPE_ARCH_SSSE3 */
 
 
+/*
+ * Provide an SSE implementation of _mm_mul_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * Surprising as it may be at first (and second, and third...) look, the
+ * signed product can be derived from the unsigned one: just subtract b
+ * from the high 32 bits of the result if a is negative (and likewise
+ * subtract a if b is negative). Modular arithmetic at its best!
+ *
+ * So for int32 a,b in crude pseudo-code ("*" here denoting a widening mul)
+ * fixupb = (signmask(b) & a) << 32ULL
+ * fixupa = (signmask(a) & b) << 32ULL
+ * a * b = (unsigned)a * (unsigned)b - fixupb - fixupa
+ * = (unsigned)a * (unsigned)b -(fixupb + fixupa)
+ *
+ * This does both lo (dwords 0/2) and hi parts (1/3) at the same time due
+ * to some optimization potential.
+ */
+static inline __m128i
+mm_mullohi_epi32(const __m128i a, const __m128i b, __m128i *res13)
+{
+   __m128i a13, b13, mul02, mul13;
+   __m128i anegmask, bnegmask, fixup, fixup02, fixup13;
+   a13 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2,3,0,1));
+   b13 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2,3,0,1));
+   anegmask = _mm_srai_epi32(a, 31);
+   bnegmask = _mm_srai_epi32(b, 31);
+   fixup = _mm_add_epi32(_mm_and_si128(anegmask, b),
+                         _mm_and_si128(bnegmask, a));
+   mul02 = _mm_mul_epu32(a, b);
+   mul13 = _mm_mul_epu32(a13, b13);
+   fixup02 = _mm_slli_epi64(fixup, 32);
+   fixup13 = _mm_and_si128(fixup, _mm_set_epi32(-1,0,-1,0));
+   *res13 = _mm_sub_epi64(mul13, fixup13);
+   return _mm_sub_epi64(mul02, fixup02);
+}
 
 
 /* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
  * _mm_mul_epu32().
  *
- * I suspect this works fine for us because one of our operands is
- * always positive, but not sure that this can be used for general
- * signed integer multiplication.
+ * This always works regardless of the signs of the operands, since
+ * the high bits (which would be different) aren't used.
  *
  * This seems close enough to the speed of SSE4 and the real
  * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
@@ -188,6 +223,12 @@ static inline __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 
    /* Interleave the results, either with shuffles or (slightly
     * faster) direct bit operations:
+    * XXX: might be only true for some cpus (in particular 65nm
+    * Core 2). On most cpus (including that Core 2, but not Nehalem...)
+    * using _mm_shuffle_ps/_mm_shuffle_epi32 might also be faster
+    * than using the 3 instructions below. But logic should be fine
+    * as well, we can't have optimal solution for all cpus (if anything,
+    * should just use _mm_mullo_epi32() if sse41 is available...).
     */
 #if 0
    __m128i ba8             = _mm_shuffle_epi32(ba, 8);
@@ -214,17 +255,44 @@ transpose4_epi32(const __m128i * restrict a,
                  __m128i * restrict q,
                  __m128i * restrict r)
 {
-  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
-  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
-  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
-  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+   __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
 
-  *o = _mm_unpacklo_epi64(t0, t1);
-  *p = _mm_unpackhi_epi64(t0, t1);
-  *q = _mm_unpacklo_epi64(t2, t3);
-  *r = _mm_unpackhi_epi64(t2, t3);
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
 }
 
+
+/*
+ * Same as above, except the first two values are already interleaved
+ * (i.e. contain 64bit values).
+ */
+static inline void
+transpose2_64_2_32(const __m128i * restrict a01,
+                   const __m128i * restrict a23,
+                   const __m128i * restrict c,
+                   const __m128i * restrict d,
+                   __m128i * restrict o,
+                   __m128i * restrict p,
+                   __m128i * restrict q,
+                   __m128i * restrict r)
+{
+   __m128i t0 = *a01;
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = *a23;
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+
 #define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index fac1cd61d77..a0de599c9c6 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -644,19 +644,25 @@ try_setup_line( struct lp_setup_context *setup,
    line->inputs.layer = layer;
    line->inputs.viewport_index = viewport_index;
 
+   /*
+    * XXX: this code is mostly identical to the one in lp_setup_tri, except it
+    * uses 4 planes instead of 3. Could share the code (including the sse
+    * assembly, in fact we'd get the 4th plane for free).
+    * The only difference apart from storing the 4th plane would be some
+    * different shuffle for calculating dcdx/dcdy.
+    */
    for (i = 0; i < 4; i++) {
 
-      /* half-edge constants, will be interated over the whole render
+      /* half-edge constants, will be iterated over the whole render
        * target.
        */
       plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]);
 
-      
-      /* correct for top-left vs. bottom-left fill convention.  
-       */         
+      /* correct for top-left vs. bottom-left fill convention.
+       */
       if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane[i].c++;            
+         plane[i].c++;
       }
       else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index a1631fdaee9..358da442ea7 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -390,26 +390,18 @@ do_triangle_ccw(struct lp_setup_context *setup,
    plane = GET_PLANES(tri);
 
 #if defined(PIPE_ARCH_SSE)
-   /*
-    * XXX this code is effectively disabled for all practical purposes,
-    * as the allowed fb size is tiny if FIXED_ORDER is 8.
-    */
-   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
-       setup->fb.height <= MAX_FIXED_LENGTH32 &&
-       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
-       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+   if (1) {
       __m128i vertx, verty;
       __m128i shufx, shufy;
-      __m128i dcdx, dcdy, c;
-      __m128i unused;
+      __m128i dcdx, dcdy;
+      __m128i cdx02, cdx13, cdy02, cdy13, c02, c13;
+      __m128i c01, c23, unused;
       __m128i dcdx_neg_mask;
       __m128i dcdy_neg_mask;
       __m128i dcdx_zero_mask;
-      __m128i top_left_flag;
-      __m128i c_inc_mask, c_inc;
+      __m128i top_left_flag, c_dec;
       __m128i eo, p0, p1, p2;
       __m128i zero = _mm_setzero_si128();
-      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
 
       vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */
       verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */
@@ -426,48 +418,70 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
       top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0);
 
-      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
-                                _mm_and_si128(dcdx_zero_mask,
-                                              _mm_xor_si128(dcdy_neg_mask,
-                                                            top_left_flag)));
+      c_dec = _mm_or_si128(dcdx_neg_mask,
+                           _mm_and_si128(dcdx_zero_mask,
+                                         _mm_xor_si128(dcdy_neg_mask,
+                                                       top_left_flag)));
 
-      c_inc = _mm_srli_epi32(c_inc_mask, 31);
+      /*
+       * 64 bit arithmetic.
+       * Note we need _signed_ mul (_mm_mul_epi32) which we emulate.
+       */
+      cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13);
+      cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13);
+      c02 = _mm_sub_epi64(cdx02, cdy02);
+      c13 = _mm_sub_epi64(cdx13, cdy13);
+      c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(2,2,0,0)));
+      c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(3,3,1,1)));
 
-      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
-                        mm_mullo_epi32(dcdy, verty));
-
-      c = _mm_add_epi32(c, c_inc);
+      /*
+       * Useful for very small fbs/tris (or fewer subpixel bits) only:
+       * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+       *                   mm_mullo_epi32(dcdy, verty));
+       *
+       * c = _mm_sub_epi32(c, c_dec);
+       */
 
       /* Scale up to match c:
        */
       dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
       dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
 
-      /* Calculate trivial reject values:
+      /*
+       * Calculate trivial reject values:
+       * Note eo cannot overflow even if dcdx/dcdy already had 31 bits
+       * (which they shouldn't have). This is because eo is never
+       * negative (although we need to be careful if we rely on that...)
        */
       eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                          _mm_and_si128(dcdx_neg_mask, dcdx));
 
       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
 
-      /* Pointless transpose which gets undone immediately in
-       * rasterization:
+      /*
+       * Pointless transpose which gets undone immediately in
+       * rasterization.
+       * It is actually difficult to do away with it - would essentially
+       * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations
+       * for this then would need to depend on the number of planes.
+       * The transpose is quite special here due to c being 64bit...
+       * The store has to be unaligned (unless we'd make the plane size
+       * a multiple of 128), and of course storing eo separately...
        */
-      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
-                       &p0, &p1, &p2, &unused);
-
-#define STORE_PLANE(plane, vec) do {                 \
-         _mm_store_si128((__m128i *)&temp_vec, vec); \
-         plane.c    = (int64_t)temp_vec[0];          \
-         plane.dcdx = temp_vec[1];                   \
-         plane.dcdy = temp_vec[2];                   \
-         plane.eo   = temp_vec[3];                   \
-      } while(0)
-
-      STORE_PLANE(plane[0], p0);
-      STORE_PLANE(plane[1], p1);
-      STORE_PLANE(plane[2], p2);
-#undef STORE_PLANE
+      c01 = _mm_unpacklo_epi64(c02, c13);
+      c23 = _mm_unpackhi_epi64(c02, c13);
+      transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy,
+                         &p0, &p1, &p2, &unused);
+      _mm_storeu_si128((__m128i *)&plane[0], p0);
+      plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[1], p1);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1));
+      plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[2], p2);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2));
+      plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo);
    } else
 #elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
    /*
@@ -577,17 +591,17 @@ do_triangle_ccw(struct lp_setup_context *setup,
       plane[2].dcdx = position->dy20;
   
       for (i = 0; i < 3; i++) {
-         /* half-edge constants, will be interated over the whole render
+         /* half-edge constants, will be iterated over the whole render
           * target.
           */
          plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
-               IMUL64(plane[i].dcdy, position->y[i]);
+                      IMUL64(plane[i].dcdy, position->y[i]);
 
          /* correct for top-left vs. bottom-left fill convention.
-          */         
+          */
          if (plane[i].dcdx < 0) {
             /* both fill conventions want this - adjust for left edges */
-            plane[i].c++;            
+            plane[i].c++;
          }
          else if (plane[i].dcdx == 0) {
             if (setup->bottom_edge_rule == 0){

From 60d0cfd4298d12d004e5f07ee5f94661ce0cc80f Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 30 Dec 2015 18:10:56 -0500
Subject: [PATCH 190/241] vbo: create a new draw function interface for
 indirect draws
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All indirect draws are passed to the new draw function. By default
there's a fallback implementation which pipes them right back to
draw_prims, but eventually both the fallback and draw_prims' support for
indirect drawing should be removed.

This should allow a backend to properly support ARB_multi_draw_indirect
and ARB_indirect_parameters.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/vbo/vbo.h            | 15 ++++++
 src/mesa/vbo/vbo_context.c    | 50 +++++++++++++++++++
 src/mesa/vbo/vbo_context.h    |  6 +++
 src/mesa/vbo/vbo_exec_array.c | 93 +++++++----------------------------
 4 files changed, 89 insertions(+), 75 deletions(-)

diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index dd9b428b104..0b8b6a9de56 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -110,6 +110,18 @@ typedef void (*vbo_draw_func)( struct gl_context *ctx,
 			       struct gl_buffer_object *indirect);
 
 
+typedef void (*vbo_indirect_draw_func)(
+   struct gl_context *ctx,
+   GLuint mode,
+   struct gl_buffer_object *indirect_data,
+   GLsizeiptr indirect_offset,
+   unsigned draw_count,
+   unsigned stride,
+   struct gl_buffer_object *indirect_params,
+   GLsizeiptr indirect_params_offset,
+   const struct _mesa_index_buffer *ib);
+
+
 
 
 /* Utility function to cope with various constraints on tnl modules or
@@ -179,6 +191,9 @@ void vbo_always_unmap_buffers(struct gl_context *ctx);
 
 void vbo_set_draw_func(struct gl_context *ctx, vbo_draw_func func);
 
+void vbo_set_indirect_draw_func(struct gl_context *ctx,
+                                vbo_indirect_draw_func func);
+
 void vbo_check_buffers_are_unmapped(struct gl_context *ctx);
 
 void vbo_bind_arrays(struct gl_context *ctx);
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index 19b35a429b3..9f807a17512 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -135,6 +135,48 @@ static void init_mat_currval(struct gl_context *ctx)
    }
 }
 
+static void
+vbo_draw_indirect_prims(struct gl_context *ctx,
+                        GLuint mode,
+                        struct gl_buffer_object *indirect_data,
+                        GLsizeiptr indirect_offset,
+                        unsigned draw_count,
+                        unsigned stride,
+                        struct gl_buffer_object *indirect_params,
+                        GLsizeiptr indirect_params_offset,
+                        const struct _mesa_index_buffer *ib)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct _mesa_prim *prim;
+   GLsizei i;
+
+   prim = calloc(draw_count, sizeof(*prim));
+   if (prim == NULL) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sDraw%sIndirect%s",
+                  (draw_count > 1) ? "Multi" : "",
+                  ib ? "Elements" : "Arrays",
+                  indirect_params ? "CountARB" : "");
+      return;
+   }
+
+   prim[0].begin = 1;
+   prim[draw_count - 1].end = 1;
+   for (i = 0; i < draw_count; ++i, indirect_offset += stride) {
+      prim[i].mode = mode;
+      prim[i].indexed = !!ib;
+      prim[i].indirect_offset = indirect_offset;
+      prim[i].is_indirect = 1;
+      prim[i].draw_id = i;
+   }
+
+   vbo->draw_prims(ctx, prim, draw_count,
+                   ib, GL_TRUE, 0, ~0,
+                   NULL, 0,
+                   ctx->DrawIndirectBuffer);
+
+   free(prim);
+}
+
 
 GLboolean _vbo_CreateContext( struct gl_context *ctx )
 {
@@ -152,6 +194,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
    init_legacy_currval( ctx );
    init_generic_currval( ctx );
    init_mat_currval( ctx );
+   vbo_set_indirect_draw_func(ctx, vbo_draw_indirect_prims);
 
    /* Build mappings from VERT_ATTRIB -> VBO_ATTRIB depending on type
     * of vertex program active.
@@ -223,3 +266,10 @@ void vbo_set_draw_func(struct gl_context *ctx, vbo_draw_func func)
    vbo->draw_prims = func;
 }
 
+
+void vbo_set_indirect_draw_func(struct gl_context *ctx,
+                                vbo_indirect_draw_func func)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   vbo->draw_indirect_prims = func;
+}
diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index 6293a8b9edc..11f9b17c7c4 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -76,6 +76,12 @@ struct vbo_context {
     * is responsible for initiating any fallback actions required:
     */
    vbo_draw_func draw_prims;
+
+   /* Optional callback for indirect draws. This allows multidraws to not be
+    * broken up, as well as for the actual count to be passed in as a separate
+    * indirect parameter.
+    */
+   vbo_indirect_draw_func draw_indirect_prims;
 };
 
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 502b2885892..2589ff4f9e2 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -1546,27 +1546,14 @@ vbo_validated_drawarraysindirect(struct gl_context *ctx,
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
-   struct _mesa_prim prim[1];
 
    vbo_bind_arrays(ctx);
 
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].is_indirect = 1;
-   prim[0].indirect_offset = (GLsizeiptr)indirect;
-
-   /* NOTE: We do NOT want to handle primitive restart here, nor perform any
-    * other checks that require knowledge of the values in the command buffer.
-    * That would defeat the whole purpose of this function.
-    */
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, 1,
-                   NULL, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, (GLsizeiptr)indirect,
+                            1 /* draw_count */, 16 /* stride */,
+                            NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1580,36 +1567,18 @@ vbo_validated_multidrawarraysindirect(struct gl_context *ctx,
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
-   struct _mesa_prim *prim;
-   GLsizei i;
    GLsizeiptr offset = (GLsizeiptr)indirect;
 
    if (primcount == 0)
       return;
-   prim = calloc(primcount, sizeof(*prim));
-   if (prim == NULL) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawArraysIndirect");
-      return;
-   }
 
    vbo_bind_arrays(ctx);
 
-   prim[0].begin = 1;
-   prim[primcount - 1].end = 1;
-   for (i = 0; i < primcount; ++i, offset += stride) {
-      prim[i].mode = mode;
-      prim[i].indirect_offset = offset;
-      prim[i].is_indirect = 1;
-      prim[i].draw_id = i;
-   }
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, primcount,
-                   NULL, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
-
-   free(prim);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            primcount, stride,
+                            NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1623,7 +1592,6 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    struct _mesa_index_buffer ib;
-   struct _mesa_prim prim[1];
 
    vbo_bind_arrays(ctx);
 
@@ -1632,19 +1600,12 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].indexed = 1;
-   prim[0].indirect_offset = (GLsizeiptr)indirect;
-   prim[0].is_indirect = 1;
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, 1,
-                   &ib, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, (GLsizeiptr)indirect,
+                            1 /* draw_count */, 20 /* stride */,
+                            NULL, 0,
+                            &ib);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1659,17 +1620,10 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    struct _mesa_index_buffer ib;
-   struct _mesa_prim *prim;
-   GLsizei i;
    GLsizeiptr offset = (GLsizeiptr)indirect;
 
    if (primcount == 0)
       return;
-   prim = calloc(primcount, sizeof(*prim));
-   if (prim == NULL) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawElementsIndirect");
-      return;
-   }
 
    vbo_bind_arrays(ctx);
 
@@ -1680,23 +1634,12 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
-   prim[0].begin = 1;
-   prim[primcount - 1].end = 1;
-   for (i = 0; i < primcount; ++i, offset += stride) {
-      prim[i].mode = mode;
-      prim[i].indexed = 1;
-      prim[i].indirect_offset = offset;
-      prim[i].is_indirect = 1;
-      prim[i].draw_id = i;
-   }
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, primcount,
-                   &ib, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
-
-   free(prim);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            primcount, stride,
+                            NULL, 0,
+                            &ib);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);

From 3e11656694857edcc98945da1a7eef40f3ba3836 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 13:07:49 -0500
Subject: [PATCH 191/241] gallium: add sufficient draw interface to allow new
 indirect features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes it possible to support indirect multidraws, as well as to have
the number of such draws come from a separate GPU resource.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/pipe/p_state.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 84633633f55..2e4d2830199 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -620,7 +620,7 @@ struct pipe_draw_info
     */
    struct pipe_stream_output_target *count_from_stream_output;
 
-   /* Indirect parameters resource: If not NULL, most values are taken
+   /* Indirect draw parameters resource: If not NULL, most values are taken
     * from this buffer instead, which is laid out as follows:
     *
     * if indexed is TRUE:
@@ -641,6 +641,15 @@ struct pipe_draw_info
     */
    struct pipe_resource *indirect;
    unsigned indirect_offset; /**< must be 4 byte aligned */
+   unsigned indirect_stride; /**< must be 4 byte aligned */
+   unsigned indirect_count; /**< number of indirect draws */
+
+   /* Indirect draw count resource: If not NULL, contains a 32-bit value which
+    * is to be used as the real indirect_count. In that case indirect_count
+    * becomes the maximum possible value.
+    */
+   struct pipe_resource *indirect_params;
+   unsigned indirect_params_offset; /**< must be 4 byte aligned */
 };
 
 

From d67b9ba9a1af18306aa68f16ee1b9bbc124da42e Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 13:30:13 -0500
Subject: [PATCH 192/241] gallium: add caps to expose support for multi
 indirect draws
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/docs/source/screen.rst               | 5 +++++
 src/gallium/drivers/freedreno/freedreno_screen.c | 2 ++
 src/gallium/drivers/i915/i915_screen.c           | 2 ++
 src/gallium/drivers/ilo/ilo_screen.c             | 2 ++
 src/gallium/drivers/llvmpipe/lp_screen.c         | 2 ++
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 2 ++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 2 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 2 ++
 src/gallium/drivers/r300/r300_screen.c           | 2 ++
 src/gallium/drivers/r600/r600_pipe.c             | 2 ++
 src/gallium/drivers/radeonsi/si_pipe.c           | 2 ++
 src/gallium/drivers/softpipe/sp_screen.c         | 2 ++
 src/gallium/drivers/svga/svga_screen.c           | 2 ++
 src/gallium/drivers/vc4/vc4_screen.c             | 2 ++
 src/gallium/drivers/virgl/virgl_screen.c         | 2 ++
 src/gallium/include/pipe/p_defines.h             | 2 ++
 16 files changed, 35 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 30d497f59e3..39ecc6334eb 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -213,6 +213,11 @@ The integer capabilities:
 * ``PIPE_CAP_DRAW_INDIRECT``: Whether the driver supports taking draw arguments
   { count, instance_count, start, index_bias } from a PIPE_BUFFER resource.
   See pipe_draw_info.
+* ``PIPE_CAP_MULTI_DRAW_INDIRECT``: Whether the driver supports
+  pipe_draw_info::indirect_stride and ::indirect_count.
+* ``PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS``: Whether the driver supports
+  taking the number of indirect draws from a separate parameter
+  buffer, see pipe_draw_info::indirect_params.
 * ``PIPE_CAP_TGSI_FS_FINE_DERIVATIVE``: Whether the fragment shader supports
   the FINE versions of DDX/DDY.
 * ``PIPE_CAP_VENDOR_ID``: The vendor ID of the underlying hardware. If it's
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 4b377b4b087..a8030f2ff60 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -226,6 +226,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
 	case PIPE_CAP_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 7eab1755c5a..f42fc37abe5 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -266,6 +266,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
       return 0;
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 6b8e619d32a..3a18e7415b9 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -463,6 +463,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index f4a51ce0a49..0898cff3a2e 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -303,6 +303,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 6ef949de809..6c4a0f31ab6 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -157,6 +157,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_USER_VERTEX_BUFFERS:
    case PIPE_CAP_COMPUTE:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 5cc000a5d68..d6131c2f994 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -212,6 +212,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 174d35df1b3..a184e8fdd87 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -208,6 +208,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index c605ce5a7b8..5e67a2ffd15 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -183,6 +183,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SAMPLE_SHADING:
         case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index d71082fddfd..563a892c4a2 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -350,6 +350,8 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 79bbc48d1f4..64f28d9c7be 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -342,6 +342,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT:
+	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 6af96d9edaa..e74044b5b0b 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -253,6 +253,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 8ad3c87b66f..17781118503 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -342,6 +342,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 2ee5a777d20..8bbacc60d41 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -171,6 +171,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_TEXEL_OFFSET:
         case PIPE_CAP_MAX_VERTEX_STREAMS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 0418cbb7964..05ce58415e2 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -201,6 +201,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index d6881f90274..591bbf32556 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -637,6 +637,8 @@ enum pipe_cap
    PIPE_CAP_CLEAR_TEXTURE,
    PIPE_CAP_DRAW_PARAMETERS,
    PIPE_CAP_TGSI_PACK_HALF_FLOAT,
+   PIPE_CAP_MULTI_DRAW_INDIRECT,
+   PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From 2860f20859454b38ce44e4e3377c036e67c20ae7 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 14:11:07 -0500
Subject: [PATCH 193/241] st/mesa: add support for new mesa indirect draw
 interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This shifts all indirect draws to go through the new function. If the
driver doesn't have support for multi draws, we break those up and
perform N draws. Otherwise, we pass everything through for just a single
draw call.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_context.c |  2 +
 src/mesa/state_tracker/st_context.h |  1 +
 src/mesa/state_tracker/st_draw.c    | 90 ++++++++++++++++++++++++++---
 3 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index e12c1663d3f..87193a9d478 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -267,6 +267,8 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
       screen->get_param(screen, PIPE_CAP_QUERY_TIME_ELAPSED);
    st->has_half_float_packing =
       screen->get_param(screen, PIPE_CAP_TGSI_PACK_HALF_FLOAT);
+   st->has_multi_draw_indirect =
+      screen->get_param(screen, PIPE_CAP_MULTI_DRAW_INDIRECT);
 
    /* GL limits and extensions */
    st_init_limits(st->pipe->screen, &ctx->Const, &ctx->Extensions);
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 91b0f975f3f..9db5f11beb5 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -102,6 +102,7 @@ struct st_context
    boolean force_persample_in_shader;
    boolean has_shareable_shaders;
    boolean has_half_float_packing;
+   boolean has_multi_draw_indirect;
 
    /**
     * If a shader can be created when we get its source.
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index d7a97169bc2..03788f33468 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -252,13 +252,7 @@ st_draw_vbo(struct gl_context *ctx,
       }
    }
 
-   if (indirect) {
-      info.indirect = st_buffer_object(indirect)->buffer;
-
-      /* Primitive restart is not handled by the VBO module in this case. */
-      info.primitive_restart = ctx->Array._PrimitiveRestart;
-      info.restart_index = ctx->Array.RestartIndex;
-   }
+   assert(!indirect);
 
    /* do actual drawing */
    for (i = 0; i < nr_prims; i++) {
@@ -274,7 +268,6 @@ st_draw_vbo(struct gl_context *ctx,
          info.min_index = info.start;
          info.max_index = info.start + info.count - 1;
       }
-      info.indirect_offset = prims[i].indirect_offset;
 
       if (ST_DEBUG & DEBUG_DRAW) {
          debug_printf("st/draw: mode %s  start %u  count %u  indexed %d\n",
@@ -284,7 +277,7 @@ st_draw_vbo(struct gl_context *ctx,
                       info.indexed);
       }
 
-      if (info.count_from_stream_output || info.indirect) {
+      if (info.count_from_stream_output) {
          cso_draw_vbo(st->cso_context, &info);
       }
       else if (info.primitive_restart) {
@@ -301,6 +294,84 @@ st_draw_vbo(struct gl_context *ctx,
    }
 }
 
+static void
+st_indirect_draw_vbo(struct gl_context *ctx,
+                     GLuint mode,
+                     struct gl_buffer_object *indirect_data,
+                     GLsizeiptr indirect_offset,
+                     unsigned draw_count,
+                     unsigned stride,
+                     struct gl_buffer_object *indirect_params,
+                     GLsizeiptr indirect_params_offset,
+                     const struct _mesa_index_buffer *ib)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_index_buffer ibuffer = {0};
+   struct pipe_draw_info info;
+
+   /* Mesa core state should have been validated already */
+   assert(ctx->NewState == 0x0);
+   assert(stride);
+
+   /* Validate state. */
+   if (st->dirty.st || ctx->NewDriverState) {
+      st_validate_state(st);
+   }
+
+   if (st->vertex_array_out_of_memory) {
+      return;
+   }
+
+   util_draw_init_info(&info);
+
+   if (ib) {
+      if (!setup_index_buffer(st, ib, &ibuffer)) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sDrawElementsIndirect%s",
+                     (draw_count > 1) ? "Multi" : "",
+                     indirect_params ? "CountARB" : "");
+         return;
+      }
+
+      info.indexed = TRUE;
+   }
+
+   info.mode = translate_prim(ctx, mode);
+   info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
+   info.indirect = st_buffer_object(indirect_data)->buffer;
+   info.indirect_offset = indirect_offset;
+
+   /* Primitive restart is not handled by the VBO module in this case. */
+   info.primitive_restart = ctx->Array._PrimitiveRestart;
+   info.restart_index = ctx->Array.RestartIndex;
+
+   if (ST_DEBUG & DEBUG_DRAW) {
+      debug_printf("st/draw indirect: mode %s drawcount %d indexed %d\n",
+                   u_prim_name(info.mode),
+                   draw_count,
+                   info.indexed);
+   }
+
+   if (!st->has_multi_draw_indirect) {
+      int i;
+
+      assert(!indirect_params);
+      info.indirect_count = 1;
+      for (i = 0; i < draw_count; i++) {
+         info.drawid = i;
+         cso_draw_vbo(st->cso_context, &info);
+         info.indirect_offset += stride;
+      }
+   } else {
+      info.indirect_count = draw_count;
+      info.indirect_stride = stride;
+      if (indirect_params) {
+         info.indirect_params = st_buffer_object(indirect_params)->buffer;
+         info.indirect_params_offset = indirect_params_offset;
+      }
+      cso_draw_vbo(st->cso_context, &info);
+   }
+}
+
 
 void
 st_init_draw(struct st_context *st)
@@ -308,6 +379,7 @@ st_init_draw(struct st_context *st)
    struct gl_context *ctx = st->ctx;
 
    vbo_set_draw_func(ctx, st_draw_vbo);
+   vbo_set_indirect_draw_func(ctx, st_indirect_draw_vbo);
 
    st->draw = draw_create(st->pipe); /* for selection/feedback */
 

From d3e43baffe06d8375b63cf8009410391f4900e05 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 2 Jan 2016 00:06:22 -0500
Subject: [PATCH 194/241] nvc0: adjust indirect draw macros to handle multiple
 draws at once

These are still invoked one at a time, but the underlying macro can
handle multiple draws.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/mme/com9097.mme      | 75 +++++++++++++------
 .../drivers/nouveau/nvc0/mme/com9097.mme.h    | 58 +++++++++-----
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   | 20 ++---
 3 files changed, 101 insertions(+), 52 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index 8c647d0c66c..35355edf2e7 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -230,29 +230,43 @@ locn_0f_ts:
  * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = index_bias
- * parm[4] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 5n + 0] = count
+ * parm[2 + 5n + 1] = instance_count
+ * parm[2 + 5n + 2] = start
+ * parm[2 + 5n + 3] = index_bias
+ * parm[2 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
  */
 .section #mme9097_draw_elts_indirect
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dei_draw_again:
    parm $r3 /* count */
    parm $r2 /* instance_count */
    parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
    parm $r4 send $r4 /* index_bias, send start */
-   maddr 0x8e4 /* CB_DATA */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
    braz $r2 #dei_end
    parm $r5 send $r4 /* start_instance, send index_bias */
    send $r5 /* send start_instance */
-   read $r6 0x50d /* VB_ELEMENT_BASE */
-   read $r7 0x50e /* VB_INSTANCE_BASE */
+   send $r6 /* draw id */
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r4
    send $r5
    maddr 0x446
    send $r4
    mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dei_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
@@ -262,48 +276,61 @@ dei_again:
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r2 #dei_again
    mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+dei_end:
+   mov $r7 (add $r7 -1)
+   branz $r7 #dei_draw_again
+   mov $r6 (add $r6 1)
+   read $r6 0xd00
+   read $r7 0xd01
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r6
    send $r7
    exit maddr 0x446
    send $r6
-dei_end:
-   exit
-   nop
 
 /* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT:
  *
  * NOTE: Saves and restores VB_INSTANCE_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 4n + 0] = count
+ * parm[2 + 4n + 1] = instance_count
+ * parm[2 + 4n + 2] = start
+ * parm[2 + 4n + 3] = start_instance
  */
 .section #mme9097_draw_arrays_indirect
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dai_draw_again:
    parm $r2 /* count */
    parm $r3 /* instance_count */
    parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
    braz $r3 #dai_end
    parm $r4 send $r4 /* start_instance */
-   maddrsend 0x8e4 /* CB_DATA, send 0 as base_vertex */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
    send $r4 /* send start_instance */
-   read $r6 0x50e /* VB_INSTANCE_BASE */
+   send $r6 /* draw id */
    maddr 0x50e /* VB_INSTANCE_BASE */
-   mov $r5 0x1
    send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dai_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
    maddr 0x35e /* VERTEX_BUFFER_COUNT */
    send $r2
-   mov $r3 (sub $r3 $r5)
+   mov $r3 (sub $r3 $r4)
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r3 #dai_again
-   mov $r1 (extrinsrt $r1 $r5 0 1 26) /* set INSTANCE_NEXT */
-   exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
-   send $r6
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
 dai_end:
-   exit
-   nop
+   mov $r7 (add $r7 -1)
+   branz $r7 #dai_draw_again
+   mov $r6 (add $r6 1)
+   exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index acad303ce60..0aebeeb6e66 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -125,24 +125,33 @@ uint32_t mme9097_tep_select[] = {
 };
 
 uint32_t mme9097_draw_elts_indirect[] = {
+	0x01434615,
+/* 0x0007: dei_draw_again */
+	0x01438715,
+	0x07400021,
+	0x00003041,
+	0x00003841,
+	0x00000601,
+/* 0x0018: dei_again */
+	0x00000701,
 	0x00000301,
+/* 0x0020: dei_end */
 	0x00000201,
 	0x017dc451,
 	0x00002431,
-/* 0x0010: dei_again */
-	0x02390021,
-	0x00061007,
+	0x0638c021,
+	0x00600041,
+	0x0004d007,
 	0x00002531,
-/* 0x001d: dei_end */
 	0x00002841,
-	0x01434615,
-	0x01438715,
+	0x00003041,
 	0x05434021,
 	0x00002041,
 	0x00002841,
 	0x01118021,
 	0x00002041,
 	0x00004411,
+	0xd0400912,
 	0x01618021,
 	0x00000841,
 	0x017e0021,
@@ -151,39 +160,50 @@ uint32_t mme9097_draw_elts_indirect[] = {
 	0x01614071,
 	0xfffe9017,
 	0xd0410912,
+	0xffffff11,
+	0xfff9b817,
+	0x00007611,
+	0x03400615,
+	0x03404715,
 	0x05434021,
 	0x00003041,
 	0x00003841,
 	0x011180a1,
 	0x00003041,
-	0x00000091,
-	0x00000011,
 };
 
 uint32_t mme9097_draw_arrays_indirect[] = {
+/* 0x0003: dai_draw_again */
+	0x01438515,
+	0x00000601,
+	0x00000701,
 	0x00000201,
+/* 0x0011: dai_again */
 	0x00000301,
-/* 0x000b: dai_again */
 	0x00d74451,
-	0x00049807,
+/* 0x0019: dai_end */
+	0x0004d807,
 	0x00002431,
-/* 0x0015: dai_end */
-	0x02390071,
+	0x0638c021,
+	0x00600041,
+	0x00000041,
 	0x00002041,
-	0x01438615,
+	0x00003041,
 	0x01438021,
-	0x00004511,
 	0x00002041,
+	0x00004411,
+	0xd0400912,
 	0x01618021,
 	0x00000841,
 	0x00d78021,
 	0x00001041,
-	0x00055b10,
+	0x00051b10,
 	0x01614071,
 	0xfffe9817,
-	0xd0414912,
+	0xd0410912,
+	0xffffff11,
+	0xfffa7817,
+	0x00007611,
 	0x014380a1,
-	0x00003041,
-	0x00000091,
-	0x00000011,
+	0x00002841,
 };
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 251753357eb..55aeb806288 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -819,8 +819,6 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    PUSH_DATA (push, 512);
    PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
    PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
-   BEGIN_NVC0(push, NVC0_3D(CB_POS), 1);
-   PUSH_DATA (push, 256 + 128);
 
    nouveau_pushbuf_space(push, 8, 0, 1);
    PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
@@ -828,7 +826,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
       assert(nvc0->idxbuf.buffer);
       assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
       size = 5 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ELEMENTS_INDIRECT), 1 + size / 4);
+      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ELEMENTS_INDIRECT), 3 + size / 4);
    } else {
       if (nvc0->state.index_bias) {
          /* index_bias is implied 0 if !info->indexed (really ?) */
@@ -837,9 +835,11 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          nvc0->state.index_bias = 0;
       }
       size = 4 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ARRAYS_INDIRECT), 1 + size / 4);
+      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ARRAYS_INDIRECT), 3 + size / 4);
    }
    PUSH_DATA(push, nvc0_prim_gl(info->mode));
+   PUSH_DATA(push, info->drawid);
+   PUSH_DATA(push, 1);
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
    nouveau_pushbuf_data(push,
                         buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
@@ -913,11 +913,13 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, 512);
       PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
       PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
-      BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
-      PUSH_DATA (push, 256 + 128);
-      PUSH_DATA (push, info->index_bias);
-      PUSH_DATA (push, info->start_instance);
-      PUSH_DATA (push, info->drawid);
+      if (!info->indirect) {
+         BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
+         PUSH_DATA (push, 256 + 128);
+         PUSH_DATA (push, info->index_bias);
+         PUSH_DATA (push, info->start_instance);
+         PUSH_DATA (push, info->drawid);
+      }
    }
 
    push->kick_notify = nvc0_draw_vbo_kick_notify;

From 7ca67c752bca08a38a7334cace15ce2b8429a318 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 2 Jan 2016 00:45:56 -0500
Subject: [PATCH 195/241] nvc0: add support for real ARB_multi_draw_indirect

The draws are now split up into groups of 32 if there's a non-packed
stride, or into groups of roughly 400-500 if the draw data is packed.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/nvc0_query_hw.c      |  2 -
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   | 58 ++++++++++++++-----
 .../drivers/nouveau/nvc0/nvc0_winsys.h        |  1 +
 4 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index 8021a65dc46..1bed0162baf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -470,8 +470,6 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
 {
    struct nvc0_hw_query *hq = nvc0_hw_query(q);
 
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-
    PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
    nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
                         NVC0_IB_ENTRY_1_NO_PREFETCH);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index a184e8fdd87..86bd8632d0b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -186,6 +186,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -208,7 +209,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_MULTI_DRAW_INDIRECT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
       return 0;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 55aeb806288..1d889b9db0d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -807,8 +807,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nv04_resource *buf = nv04_resource(info->indirect);
-   unsigned size;
-   const uint32_t offset = buf->offset + info->indirect_offset;
+   unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
+   uint32_t offset = buf->offset + info->indirect_offset;
 
    /* must make FIFO wait for engines idle before continuing to process */
    if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
@@ -820,13 +820,11 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
    PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
 
-   nouveau_pushbuf_space(push, 8, 0, 1);
-   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
       assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
-      size = 5 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ELEMENTS_INDIRECT), 3 + size / 4);
+      size = 5;
+      macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT;
    } else {
       if (nvc0->state.index_bias) {
          /* index_bias is implied 0 if !info->indexed (really ?) */
@@ -834,15 +832,47 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
          nvc0->state.index_bias = 0;
       }
-      size = 4 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ARRAYS_INDIRECT), 3 + size / 4);
+      size = 4;
+      macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT;
+   }
+
+   /* If the stride is not the natural stride, we have to stick a separate
+    * push data reference for each draw. Otherwise it can all go in as one.
+    * Of course there is a maximum packet size, so we have to break things up
+    * along those borders as well.
+    */
+   while (count) {
+      unsigned draws = count, pushes, i;
+      if (info->indirect_stride == size * 4) {
+         draws = MIN2(draws, (NV04_PFIFO_MAX_PACKET_LEN - 4) / size);
+         pushes = 1;
+      } else {
+         draws = MIN2(draws, 32);
+         pushes = draws;
+      }
+
+      nouveau_pushbuf_space(push, 8, 0, pushes);
+      PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
+      PUSH_DATA(push, NVC0_FIFO_PKHDR_1I(0, macro, 3 + draws * size));
+      PUSH_DATA(push, nvc0_prim_gl(info->mode));
+      PUSH_DATA(push, drawid);
+      PUSH_DATA(push, draws);
+      if (pushes == 1) {
+         nouveau_pushbuf_data(push,
+                              buf->bo, offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4 * draws));
+         offset += draws * info->indirect_stride;
+      } else {
+         for (i = 0; i < pushes; i++) {
+            nouveau_pushbuf_data(push,
+                                 buf->bo, offset,
+                                 NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4));
+            offset += info->indirect_stride;
+         }
+      }
+      count -= draws;
+      drawid += draws;
    }
-   PUSH_DATA(push, nvc0_prim_gl(info->mode));
-   PUSH_DATA(push, info->drawid);
-   PUSH_DATA(push, 1);
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-   nouveau_pushbuf_data(push,
-                        buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
 }
 
 static inline void
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
index 4ea8ca3cfa2..79abe78b77a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
@@ -68,6 +68,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_SW(m) 7, (m)
 
 #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE
+#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
 
 static inline uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)

From b3e2c21fe5af4ab2f0f1584b715a1ab3c5eb5ca3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 15:19:51 -0500
Subject: [PATCH 196/241] glapi: add ARB_indirect_parameters definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 .../glapi/gen/ARB_indirect_parameters.xml     | 30 +++++++++++++++++++
 src/mapi/glapi/gen/Makefile.am                |  1 +
 src/mapi/glapi/gen/gl_API.xml                 |  6 +++-
 src/mesa/main/extensions_table.h              |  1 +
 src/mesa/main/mtypes.h                        |  1 +
 src/mesa/main/tests/dispatch_sanity.cpp       |  4 +++
 src/mesa/vbo/vbo_exec_array.c                 | 21 +++++++++++++
 7 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 src/mapi/glapi/gen/ARB_indirect_parameters.xml

diff --git a/src/mapi/glapi/gen/ARB_indirect_parameters.xml b/src/mapi/glapi/gen/ARB_indirect_parameters.xml
new file mode 100644
index 00000000000..20de9057707
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_indirect_parameters.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<OpenGLAPI>
+
+<category name="GL_ARB_indirect_parameters" number="154">
+
+    <enum name="PARAMETER_BUFFER_ARB"                   value="0x80EE"/>
+    <enum name="PARAMETER_BUFFER_BINDING_ARB"           value="0x80EF"/>
+
+    <function name="MultiDrawArraysIndirectCountARB" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="indirect" type="GLintptr"/>
+        <param name="drawcount" type="GLintptr"/>
+        <param name="maxdrawcount" type="GLsizei"/>
+        <param name="stride" type="GLsizei"/>
+    </function>
+
+    <function name="MultiDrawElementsIndirectCountARB" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="type" type="GLenum"/>
+        <param name="indirect" type="GLintptr"/>
+        <param name="drawcount" type="GLintptr"/>
+        <param name="maxdrawcount" type="GLsizei"/>
+        <param name="stride" type="GLsizei"/>
+    </function>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 2da8f7ddd9d..900b61a5d45 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -137,6 +137,7 @@ API_XML = \
 	ARB_get_texture_sub_image.xml \
 	ARB_gpu_shader_fp64.xml \
 	ARB_gpu_shader5.xml \
+	ARB_indirect_parameters.xml \
 	ARB_instanced_arrays.xml \
 	ARB_internalformat_query.xml \
 	ARB_invalidate_subdata.xml \
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 21f6293bb6c..593ace49563 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8247,7 +8247,11 @@
 
 <xi:include href="ARB_multi_bind.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions 148 - 159 -->
+<!-- ARB extensions 148 - 153 -->
+
+<xi:include href="ARB_indirect_parameters.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extensions 155 - 159 -->
 
 <xi:include href="ARB_clip_control.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 789b55a3c8d..aeccb017423 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -70,6 +70,7 @@ EXT(ARB_gpu_shader5                         , ARB_gpu_shader5
 EXT(ARB_gpu_shader_fp64                     , ARB_gpu_shader_fp64                    ,  x , GLC,  x ,  x , 2010)
 EXT(ARB_half_float_pixel                    , dummy_true                             , GLL, GLC,  x ,  x , 2003)
 EXT(ARB_half_float_vertex                   , ARB_half_float_vertex                  , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_indirect_parameters                 , ARB_indirect_parameters                ,  x , GLC,  x ,  x , 2013)
 EXT(ARB_instanced_arrays                    , ARB_instanced_arrays                   , GLL, GLC,  x ,  x , 2008)
 EXT(ARB_internalformat_query                , ARB_internalformat_query               , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_invalidate_subdata                  , dummy_true                             , GLL, GLC,  x ,  x , 2012)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5b9fce8b7cc..5cd2e8eb3af 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3700,6 +3700,7 @@ struct gl_extensions
    GLboolean ARB_gpu_shader5;
    GLboolean ARB_gpu_shader_fp64;
    GLboolean ARB_half_float_vertex;
+   GLboolean ARB_indirect_parameters;
    GLboolean ARB_instanced_arrays;
    GLboolean ARB_internalformat_query;
    GLboolean ARB_map_buffer_range;
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index d288b1dbe94..7610bcbd701 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -1844,6 +1844,10 @@ const struct function gl_core_functions_possible[] = {
    { "glGetQueryBufferObjecti64v", 45, -1 },
    { "glGetQueryBufferObjectui64v", 45, -1 },
 
+   /* GL_ARB_indirect_parameters */
+   { "glMultiDrawArraysIndirectCountARB", 31, -1 },
+   { "glMultiDrawElementsIndirectCountARB", 31, -1 },
+
    { NULL, 0, -1 }
 };
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 2589ff4f9e2..c5019b1aa59 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -1732,6 +1732,25 @@ vbo_exec_MultiDrawElementsIndirect(GLenum mode, GLenum type,
                                            primcount, stride);
 }
 
+static void GLAPIENTRY
+vbo_exec_MultiDrawArraysIndirectCount(GLenum mode,
+                                      GLintptr indirect,
+                                      GLintptr drawcount,
+                                      GLsizei maxdrawcount, GLsizei stride)
+{
+
+}
+
+static void GLAPIENTRY
+vbo_exec_MultiDrawElementsIndirectCount(GLenum mode, GLenum type,
+                                        GLintptr indirect,
+                                        GLintptr drawcount,
+                                        GLsizei maxdrawcount, GLsizei stride)
+{
+
+}
+
+
 /**
  * Initialize the dispatch table with the VBO functions for drawing.
  */
@@ -1779,6 +1798,8 @@ vbo_initialize_exec_dispatch(const struct gl_context *ctx,
    if (ctx->API == API_OPENGL_CORE) {
       SET_MultiDrawArraysIndirect(exec, vbo_exec_MultiDrawArraysIndirect);
       SET_MultiDrawElementsIndirect(exec, vbo_exec_MultiDrawElementsIndirect);
+      SET_MultiDrawArraysIndirectCountARB(exec, vbo_exec_MultiDrawArraysIndirectCount);
+      SET_MultiDrawElementsIndirectCountARB(exec, vbo_exec_MultiDrawElementsIndirectCount);
    }
 
    if (_mesa_is_desktop_gl(ctx) || _mesa_is_gles3(ctx)) {

From 9327e2d312e5da58f1cf4dbb806c67fcefd892f5 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 15:47:17 -0500
Subject: [PATCH 197/241] mesa: add parameter buffer, used for
 ARB_indirect_parameters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/main/bufferobj.c        | 15 +++++++++++++++
 src/mesa/main/get.c              |  5 +++++
 src/mesa/main/get_hash_params.py |  4 ++++
 src/mesa/main/mtypes.h           |  1 +
 4 files changed, 25 insertions(+)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index a1e47d62773..d7c5680661b 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -127,6 +127,11 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
          return &ctx->DrawIndirectBuffer;
       }
       break;
+   case GL_PARAMETER_BUFFER_ARB:
+      if (_mesa_has_ARB_indirect_parameters(ctx)) {
+         return &ctx->ParameterBuffer;
+      }
+      break;
    case GL_DISPATCH_INDIRECT_BUFFER:
       if (_mesa_has_compute_shaders(ctx)) {
          return &ctx->DispatchIndirectBuffer;
@@ -866,6 +871,9 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer,
 				 ctx->Shared->NullBufferObj);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ParameterBuffer,
+				 ctx->Shared->NullBufferObj);
+
    _mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer,
 				 ctx->Shared->NullBufferObj);
 
@@ -913,6 +921,8 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
 
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer, NULL);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ParameterBuffer, NULL);
+
    _mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer, NULL);
 
    for (i = 0; i < MAX_COMBINED_UNIFORM_BUFFERS; i++) {
@@ -1261,6 +1271,11 @@ _mesa_DeleteBuffers(GLsizei n, const GLuint *ids)
             _mesa_BindBuffer( GL_DRAW_INDIRECT_BUFFER, 0 );
          }
 
+         /* unbind ARB_indirect_parameters binding point */
+         if (ctx->ParameterBuffer == bufObj) {
+            _mesa_BindBuffer(GL_PARAMETER_BUFFER_ARB, 0);
+         }
+
          /* unbind ARB_compute_shader binding point */
          if (ctx->DispatchIndirectBuffer == bufObj) {
             _mesa_BindBuffer(GL_DISPATCH_INDIRECT_BUFFER, 0);
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index c6a2e5b912c..95cb18c8ee8 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -423,6 +423,7 @@ EXTRA_EXT(ARB_framebuffer_no_attachments);
 EXTRA_EXT(ARB_tessellation_shader);
 EXTRA_EXT(ARB_shader_subroutine);
 EXTRA_EXT(ARB_shader_storage_buffer_object);
+EXTRA_EXT(ARB_indirect_parameters);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -1032,6 +1033,10 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
    case GL_DRAW_INDIRECT_BUFFER_BINDING:
       v->value_int = ctx->DrawIndirectBuffer->Name;
       break;
+   /* GL_ARB_indirect_parameters */
+   case GL_PARAMETER_BUFFER_BINDING_ARB:
+      v->value_int = ctx->ParameterBuffer->Name;
+      break;
    /* GL_ARB_separate_shader_objects */
    case GL_PROGRAM_PIPELINE_BINDING:
       if (ctx->Pipeline.Current) {
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 7a48ed2f414..af7a8f4a906 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -887,6 +887,10 @@ descriptor=[
 # GL_ARB_shader_subroutine
   [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), extra_ARB_shader_subroutine" ],
   [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), extra_ARB_shader_subroutine" ],
+
+# GL_ARB_indirect_parameters
+  [ "PARAMETER_BUFFER_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_indirect_parameters" ],
+
 ]}
 
 ]
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5cd2e8eb3af..dd52368ef65 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -4349,6 +4349,7 @@ struct gl_context
    struct gl_perf_monitor_state PerfMonitor;
 
    struct gl_buffer_object *DrawIndirectBuffer; /** < GL_ARB_draw_indirect */
+   struct gl_buffer_object *ParameterBuffer; /** < GL_ARB_indirect_parameters */
    struct gl_buffer_object *DispatchIndirectBuffer; /** < GL_ARB_compute_shader */
 
    struct gl_buffer_object *CopyReadBuffer; /**< GL_ARB_copy_buffer */

From e1eab5a76f20061c005c8254b11ca1611ebda8f7 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 16:11:56 -0500
Subject: [PATCH 198/241] mesa: add support for ARB_indirect_parameters draw
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/main/api_validate.c  | 115 ++++++++++++++++++++++++++++++++++
 src/mesa/main/api_validate.h  |  16 +++++
 src/mesa/vbo/vbo_exec_array.c | 103 ++++++++++++++++++++++++++++++
 3 files changed, 234 insertions(+)

diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index d693ec64ce4..2b629977644 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -920,6 +920,121 @@ _mesa_validate_MultiDrawElementsIndirect(struct gl_context *ctx,
    return GL_TRUE;
 }
 
+static GLboolean
+valid_draw_indirect_parameters(struct gl_context *ctx, const char *name,
+                               GLintptr drawcount)
+{
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_VALUE is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if <drawcount> is not a multiple of
+    *  four."
+    */
+   if (drawcount & 3) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(drawcount is not a multiple of 4)", name);
+      return GL_FALSE;
+   }
+
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_OPERATION is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if no buffer is bound to the
+    *  PARAMETER_BUFFER_ARB binding point."
+    */
+   if (!_mesa_is_bufferobj(ctx->ParameterBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s: no buffer bound to PARAMETER_BUFFER", name);
+      return GL_FALSE;
+   }
+
+   if (_mesa_check_disallowed_mapping(ctx->ParameterBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(PARAMETER_BUFFER is mapped)", name);
+      return GL_FALSE;
+   }
+
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_OPERATION is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if reading a <sizei> typed value
+    *  from the buffer bound to the PARAMETER_BUFFER_ARB target at the offset
+    *  specified by <drawcount> would result in an out-of-bounds access."
+    * Negative <drawcount> is tested first: the unsigned sum would wrap. */
+   if (drawcount < 0 ||
+       ctx->ParameterBuffer->Size < drawcount + sizeof(GLsizei)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(PARAMETER_BUFFER too small)", name);
+      return GL_FALSE;
+   }
+
+   return GL_TRUE;
+}
+
+GLboolean
+_mesa_validate_MultiDrawArraysIndirectCount(struct gl_context *ctx,
+                                            GLenum mode,
+                                            GLintptr indirect,
+                                            GLintptr drawcount,
+                                            GLsizei maxdrawcount,
+                                            GLsizei stride)
+{
+   GLsizeiptr size = 0;
+   const unsigned drawArraysNumParams = 4;
+
+   FLUSH_CURRENT(ctx, 0);
+
+   /* caller has converted stride==0 to drawArraysNumParams * sizeof(GLuint) */
+   assert(stride != 0);
+
+   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                  "glMultiDrawArraysIndirectCountARB"))
+      return GL_FALSE;
+
+   /* number of bytes of the indirect buffer which will be read */
+   size = maxdrawcount
+      ? (maxdrawcount - 1) * stride + drawArraysNumParams * sizeof(GLuint)
+      : 0;
+
+   if (!valid_draw_indirect(ctx, mode, (void *)indirect, size,
+                            "glMultiDrawArraysIndirectCountARB"))
+      return GL_FALSE;
+
+   return valid_draw_indirect_parameters(
+         ctx, "glMultiDrawArraysIndirectCountARB", drawcount);
+}
+
+GLboolean
+_mesa_validate_MultiDrawElementsIndirectCount(struct gl_context *ctx,
+                                              GLenum mode, GLenum type,
+                                              GLintptr indirect,
+                                              GLintptr drawcount,
+                                              GLsizei maxdrawcount,
+                                              GLsizei stride)
+{
+   GLsizeiptr size = 0;
+   const unsigned drawElementsNumParams = 5;
+
+   FLUSH_CURRENT(ctx, 0);
+
+   /* caller has converted stride==0 to drawElementsNumParams * sizeof(GLuint) */
+   assert(stride != 0);
+
+   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                  "glMultiDrawElementsIndirectCountARB"))
+      return GL_FALSE;
+
+   /* number of bytes of the indirect buffer which will be read */
+   size = maxdrawcount
+      ? (maxdrawcount - 1) * stride + drawElementsNumParams * sizeof(GLuint)
+      : 0;
+
+   if (!valid_draw_indirect_elements(ctx, mode, type,
+                                     (void *)indirect, size,
+                                     "glMultiDrawElementsIndirectCountARB"))
+      return GL_FALSE;
+
+   return valid_draw_indirect_parameters(
+         ctx, "glMultiDrawElementsIndirectCountARB", drawcount);
+}
+
 static bool
 check_valid_to_compute(struct gl_context *ctx, const char *function)
 {
diff --git a/src/mesa/main/api_validate.h b/src/mesa/main/api_validate.h
index 5d030a7ba37..5b321e3ac99 100644
--- a/src/mesa/main/api_validate.h
+++ b/src/mesa/main/api_validate.h
@@ -105,6 +105,22 @@ _mesa_validate_MultiDrawElementsIndirect(struct gl_context *ctx,
                                          GLsizei primcount,
                                          GLsizei stride);
 
+extern GLboolean
+_mesa_validate_MultiDrawArraysIndirectCount(struct gl_context *ctx,
+                                            GLenum mode,
+                                            GLintptr indirect,
+                                            GLintptr drawcount,
+                                            GLsizei maxdrawcount,
+                                            GLsizei stride);
+
+extern GLboolean
+_mesa_validate_MultiDrawElementsIndirectCount(struct gl_context *ctx,
+                                              GLenum mode, GLenum type,
+                                              GLintptr indirect,
+                                              GLintptr drawcount,
+                                              GLsizei maxdrawcount,
+                                              GLsizei stride);
+
 extern GLboolean
 _mesa_validate_DispatchCompute(struct gl_context *ctx,
                                const GLuint *num_groups);
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index c5019b1aa59..02139ef881f 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -1732,13 +1732,96 @@ vbo_exec_MultiDrawElementsIndirect(GLenum mode, GLenum type,
                                            primcount, stride);
 }
 
+static void
+vbo_validated_multidrawarraysindirectcount(struct gl_context *ctx,
+                                           GLenum mode,
+                                           GLintptr indirect,
+                                           GLintptr drawcount,
+                                           GLsizei maxdrawcount,
+                                           GLsizei stride)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct vbo_exec_context *exec = &vbo->exec;
+   GLsizeiptr offset = indirect;
+
+   if (maxdrawcount == 0)
+      return;
+
+   vbo_bind_arrays(ctx);
+
+   check_buffers_are_unmapped(exec->array.inputs);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            maxdrawcount, stride,
+                            ctx->ParameterBuffer, drawcount,
+                            NULL);
+
+   if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
+      _mesa_flush(ctx);
+}
+
+static void
+vbo_validated_multidrawelementsindirectcount(struct gl_context *ctx,
+                                             GLenum mode, GLenum type,
+                                             GLintptr indirect,
+                                             GLintptr drawcount,
+                                             GLsizei maxdrawcount,
+                                             GLsizei stride)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct vbo_exec_context *exec = &vbo->exec;
+   struct _mesa_index_buffer ib;
+   GLsizeiptr offset = (GLsizeiptr)indirect;
+
+   if (maxdrawcount == 0)
+      return;
+
+   vbo_bind_arrays(ctx);
+
+   /* NOTE: IndexBufferObj is guaranteed to be a VBO. */
+
+   ib.count = 0; /* unknown */
+   ib.type = type;
+   ib.obj = ctx->Array.VAO->IndexBufferObj;
+   ib.ptr = NULL;
+
+   check_buffers_are_unmapped(exec->array.inputs);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            maxdrawcount, stride,
+                            ctx->ParameterBuffer, drawcount,
+                            &ib);
+
+   if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
+      _mesa_flush(ctx);
+}
+
 static void GLAPIENTRY
 vbo_exec_MultiDrawArraysIndirectCount(GLenum mode,
                                       GLintptr indirect,
                                       GLintptr drawcount,
                                       GLsizei maxdrawcount, GLsizei stride)
 {
+   GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_DRAW)
+      _mesa_debug(ctx, "glMultiDrawArraysIndirectCountARB"
+                  "(%s, %lx, %lx, %i, %i)\n",
+                  _mesa_enum_to_string(mode), (unsigned long)indirect,
+                  (unsigned long)drawcount, maxdrawcount, stride);
+
+   /* If <stride> is zero, the array elements are treated as tightly packed. */
+   if (stride == 0)
+      stride = 4 * sizeof(GLuint); /* sizeof(DrawArraysIndirectCommand) */
+
+   if (!_mesa_validate_MultiDrawArraysIndirectCount(ctx, mode,
+                                                    indirect, drawcount,
+                                                    maxdrawcount, stride))
+      return;
+
+   vbo_validated_multidrawarraysindirectcount(ctx, mode,
+                                              indirect, drawcount,
+                                              maxdrawcount, stride);
 }
 
 static void GLAPIENTRY
@@ -1747,7 +1830,27 @@ vbo_exec_MultiDrawElementsIndirectCount(GLenum mode, GLenum type,
                                         GLintptr drawcount,
                                         GLsizei maxdrawcount, GLsizei stride)
 {
+   GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_DRAW)
+      _mesa_debug(ctx, "glMultiDrawElementsIndirectCountARB"
+                  "(%s, %s, %lx, %lx, %i, %i)\n",
+                  _mesa_enum_to_string(mode),
+                  _mesa_enum_to_string(type), (unsigned long)indirect,
+                  (unsigned long)drawcount, maxdrawcount, stride);
+
+   /* If <stride> is zero, the array elements are treated as tightly packed. */
+   if (stride == 0)
+      stride = 5 * sizeof(GLuint); /* sizeof(DrawElementsIndirectCommand) */
+
+   if (!_mesa_validate_MultiDrawElementsIndirectCount(ctx, mode, type,
+                                                      indirect, drawcount,
+                                                      maxdrawcount, stride))
+      return;
+
+   vbo_validated_multidrawelementsindirectcount(ctx, mode, type,
+                                                indirect, drawcount,
+                                                maxdrawcount, stride);
 }
 
 

From 9a54ccf30a1342a0ec55c30358b88ee5df5080ce Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Dec 2015 16:17:19 -0500
Subject: [PATCH 199/241] st/mesa: expose ARB_indirect_parameters when the
 backend driver allows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_cb_bufferobjects.c | 1 +
 src/mesa/state_tracker/st_extensions.c       | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index c75f4765b94..8ca7e4c379b 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -228,6 +228,7 @@ st_bufferobj_data(struct gl_context *ctx,
       bind = PIPE_BIND_CONSTANT_BUFFER;
       break;
    case GL_DRAW_INDIRECT_BUFFER:
+   case GL_PARAMETER_BUFFER_ARB:
       bind = PIPE_BIND_COMMAND_ARGS_BUFFER;
       break;
    default:
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 9b01bdc129e..91ad169d786 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -453,6 +453,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_draw_instanced),               PIPE_CAP_TGSI_INSTANCEID                  },
       { o(ARB_fragment_program_shadow),      PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_framebuffer_object),           PIPE_CAP_MIXED_FRAMEBUFFER_SIZES          },
+      { o(ARB_indirect_parameters),          PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS       },
       { o(ARB_instanced_arrays),             PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR  },
       { o(ARB_occlusion_query),              PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_occlusion_query2),             PIPE_CAP_OCCLUSION_QUERY                  },

From 67b31b3c59a3b950897709d6c472348c4e12951c Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 2 Jan 2016 11:38:42 -0500
Subject: [PATCH 200/241] nvc0: add ARB_indirect_parameters support

I chose to make separate macros for this due to the additional
complexity and extra scratch usage.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 docs/relnotes/11.2.0.html                     |   1 +
 .../drivers/nouveau/nvc0/mme/com9097.mme      | 157 ++++++++++++++++++
 .../drivers/nouveau/nvc0/mme/com9097.mme.h    | 125 ++++++++++++++
 .../drivers/nouveau/nvc0/nvc0_macros.h        |   4 +
 .../drivers/nouveau/nvc0/nvc0_screen.c        |   4 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   |  29 +++-
 6 files changed, 314 insertions(+), 6 deletions(-)

diff --git a/docs/relnotes/11.2.0.html b/docs/relnotes/11.2.0.html
index d31da8ba4a6..616c134a768 100644
--- a/docs/relnotes/11.2.0.html
+++ b/docs/relnotes/11.2.0.html
@@ -47,6 +47,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_base_instance on freedreno/a4xx</li>
 <li>GL_ARB_compute_shader on i965</li>
 <li>GL_ARB_copy_image on r600</li>
+<li>GL_ARB_indirect_parameters on nvc0</li>
 <li>GL_ARB_shader_draw_parameters on i965, nvc0</li>
 <li>GL_ARB_tessellation_shader on i965 and r600 (evergreen/cayman only)</li>
 <li>GL_ARB_texture_buffer_object_rgb32 on freedreno/a4xx</li>
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index 35355edf2e7..4daa57d47bb 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -334,3 +334,160 @@ dai_end:
    mov $r6 (add $r6 1)
    exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
    send $r5
+
+/* NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT
+ *
+ * NOTE: Saves and restores VB_ELEMENT,INSTANCE_BASE.
+ * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams (count of 5-word draw records that follow)
+ * parm[2] = totaldraws
+ * parm[3 + 5n + 0] = count
+ * parm[3 + 5n + 1] = instance_count
+ * parm[3 + 5n + 2] = start
+ * parm[3 + 5n + 3] = index_bias
+ * parm[3 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
+ * SCRATCH[2] = draws left
+ */
+.section #mme9097_draw_elts_indirect_count
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00 /* SCRATCH[0]; the following sends fill consecutive slots */
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #deic_runout /* nothing left to draw: just consume the records */
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz $r3 #deic_runout /* difference went negative: consume records too */
+   send $r5 /* SCRATCH[2] = draws left */
+deic_draw_again:
+   parm $r3 /* count */
+   parm $r2 /* instance_count */
+   parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
+   parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   braz $r2 #deic_end /* instance_count == 0: no primitives for this record */
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
+   send $r4
+   send $r5
+   maddr 0x446 /* presumably VERTEX_ID_BASE (see NOTE above) - TODO confirm */
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+deic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x5f8 /* INDEX_BATCH_COUNT */
+   send $r3 /* count */
+   mov $r2 (sub $r2 $r4) /* one instance done ($r4 holds 1) */
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r2 #deic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+deic_end:
+   read $r5 0xd02 /* reload draws left; $r5 was reused for start_instance */
+   mov $r5 (add $r5 -1)
+   braz $r5 #deic_runout_check /* no draws left: skip any remaining records */
+   mov $r7 (add $r7 -1) /* one record consumed */
+   maddr 0xd02
+   send $r5 /* SCRATCH[2] = updated draws left */
+   branz $r7 #deic_draw_again
+   mov $r6 (add $r6 1) /* next draw id */
+deic_restore:
+   read $r6 0xd00 /* saved VB_ELEMENT_BASE */
+   read $r7 0xd01 /* saved VB_INSTANCE_BASE */
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
+   send $r6
+   send $r7
+   exit maddr 0x446 /* presumably VERTEX_ID_BASE - TODO confirm */
+   send $r6
+deic_runout:
+   parm $r2 /* read and ignore one 5-word draw record */
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+deic_runout_check:
+   branz annul $r7 #deic_runout /* loop while records remain */
+   bra annul #deic_restore
+
+/* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT:
+ *
+ * NOTE: Saves and restores VB_INSTANCE_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams (count of 4-word draw records that follow)
+ * parm[2] = totaldraws
+ * parm[3 + 4n + 0] = count
+ * parm[3 + 4n + 1] = instance_count
+ * parm[3 + 4n + 2] = start
+ * parm[3 + 4n + 3] = start_instance
+ *
+ * SCRATCH[0] = VB_INSTANCE_BASE
+ */
+.section #mme9097_draw_arrays_indirect_count
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   maddr 0xd00 /* SCRATCH[0] */
+   parm $r6 send $r5 /* start_drawid, save VB_INSTANCE_BASE */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #daic_runout /* nothing left to draw: just consume the records */
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz annul $r3 #daic_runout /* difference went negative: consume too */
+daic_draw_again:
+   parm $r2 /* count */
+   parm $r3 /* instance_count */
+   parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
+   braz $r3 #daic_end /* instance_count == 0: no primitives for this record */
+   parm $r4 send $r4 /* start_instance */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
+   send $r4 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x50e /* VB_INSTANCE_BASE */
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+daic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x35e /* VERTEX_BUFFER_COUNT */
+   send $r2
+   mov $r3 (sub $r3 $r4) /* one instance done ($r4 holds 1) */
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r3 #daic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+daic_end:
+   mov $r5 (add $r5 -1) /* one fewer draw left */
+   braz $r5 #daic_runout_check /* no draws left: skip any remaining records */
+   mov $r7 (add $r7 -1) /* one record consumed */
+   branz $r7 #daic_draw_again
+   mov $r6 (add $r6 1) /* next draw id */
+daic_restore:
+   read $r5 0xd00 /* saved VB_INSTANCE_BASE */
+   exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
+daic_runout:
+   parm $r2 /* read and ignore one 4-word draw record */
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+daic_runout_check:
+   branz annul $r7 #daic_runout /* loop while records remain */
+   bra annul #daic_restore
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index 0aebeeb6e66..bf8625e0584 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -207,3 +207,128 @@ uint32_t mme9097_draw_arrays_indirect[] = {
 	0x014380a1,
 	0x00002841,
 };
+
+uint32_t mme9097_draw_elts_indirect_count[] = {
+	0x01434615,
+	0x01438715,
+	0x07400021,
+/* 0x000d: deic_draw_again */
+	0x00003041,
+	0x00003841,
+	0x00000601,
+	0x00000701,
+/* 0x001e: deic_again */
+	0x00000501,
+	0x0005ad10,
+/* 0x0026: deic_end */
+	0x000b2807,
+	0x007f4312,
+/* 0x002e: deic_restore */
+	0x000a9817,
+	0x00002841,
+/* 0x0035: deic_runout */
+	0x00000301,
+/* 0x003b: deic_runout_check */
+	0x00000201,
+	0x017dc451,
+	0x00002431,
+	0x0638c021,
+	0x00600041,
+	0x0004d007,
+	0x00002531,
+	0x00002841,
+	0x00003041,
+	0x05434021,
+	0x00002041,
+	0x00002841,
+	0x01118021,
+	0x00002041,
+	0x00004411,
+	0xd0400912,
+	0x01618021,
+	0x00000841,
+	0x017e0021,
+	0x00001841,
+	0x00051210,
+	0x01614071,
+	0xfffe9017,
+	0xd0410912,
+	0x03408515,
+	0xffffed11,
+	0x0004e807,
+	0xffffff11,
+	0x03408021,
+	0x00002841,
+	0xfff87817,
+	0x00007611,
+	0x03400615,
+	0x03404715,
+	0x05434021,
+	0x00003041,
+	0x00003841,
+	0x011180a1,
+	0x00003041,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0xffffff11,
+	0xfffeb837,
+	0xfffc8027,
+};
+
+uint32_t mme9097_draw_arrays_indirect_count[] = {
+	0x01438515,
+	0x03400021,
+/* 0x0009: daic_draw_again */
+	0x00002e31,
+	0x00000701,
+	0x00000501,
+/* 0x0017: daic_again */
+	0x0005ad10,
+	0x00086807,
+/* 0x001f: daic_end */
+	0x007f4312,
+	0x0007d837,
+/* 0x0024: daic_restore */
+/* 0x0027: daic_runout */
+	0x00000201,
+	0x00000301,
+/* 0x002c: daic_runout_check */
+	0x00d74451,
+	0x0004d807,
+	0x00002431,
+	0x0638c021,
+	0x00600041,
+	0x00000041,
+	0x00002041,
+	0x00003041,
+	0x01438021,
+	0x00002041,
+	0x00004411,
+	0xd0400912,
+	0x01618021,
+	0x00000841,
+	0x00d78021,
+	0x00001041,
+	0x00051b10,
+	0x01614071,
+	0xfffe9817,
+	0xd0410912,
+	0xffffed11,
+	0x00032807,
+	0xffffff11,
+	0xfff9f817,
+	0x00007611,
+	0x03400515,
+	0x014380a1,
+	0x00002841,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0x00000201,
+	0xffffff11,
+	0xfffef837,
+	0xfffdc027,
+};
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
index bf2798a44a0..27c026b8b30 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
@@ -29,4 +29,8 @@
 
 #define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT			0x00003840
 
+#define NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT		0x00003848
+
+#define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT		0x00003850
+
 #endif /* __NVC0_MACROS_H__ */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 86bd8632d0b..c8510b8bb5a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -187,6 +187,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -209,7 +210,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -1029,6 +1029,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
 
    BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
    PUSH_DATA (push, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 1d889b9db0d..ad79d1cbb9c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -807,12 +807,16 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nv04_resource *buf = nv04_resource(info->indirect);
+   struct nv04_resource *buf_count = nv04_resource(info->indirect_params);
    unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
    uint32_t offset = buf->offset + info->indirect_offset;
 
    /* must make FIFO wait for engines idle before continuing to process */
-   if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
+   if ((buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) ||
+       (buf_count && buf_count->fence_wr &&
+        !nouveau_fence_signalled(buf_count->fence_wr))) {
       IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0);
+   }
 
    /* Queue things up to let the macros write params to the driver constbuf */
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
@@ -824,7 +828,10 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
       assert(nvc0->idxbuf.buffer);
       assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
       size = 5;
-      macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT;
    } else {
       if (nvc0->state.index_bias) {
          /* index_bias is implied 0 if !info->indexed (really ?) */
@@ -833,7 +840,10 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          nvc0->state.index_bias = 0;
       }
       size = 4;
-      macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT;
    }
 
    /* If the stride is not the natural stride, we have to stick a separate
@@ -851,12 +861,21 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          pushes = draws;
       }
 
-      nouveau_pushbuf_space(push, 8, 0, pushes);
+      nouveau_pushbuf_space(push, 16, 0, pushes + !!buf_count);
       PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
-      PUSH_DATA(push, NVC0_FIFO_PKHDR_1I(0, macro, 3 + draws * size));
+      if (buf_count)
+         PUSH_REFN(push, buf_count->bo, NOUVEAU_BO_RD | buf_count->domain);
+      PUSH_DATA(push,
+                NVC0_FIFO_PKHDR_1I(0, macro, 3 + !!buf_count + draws * size));
       PUSH_DATA(push, nvc0_prim_gl(info->mode));
       PUSH_DATA(push, drawid);
       PUSH_DATA(push, draws);
+      if (buf_count) {
+         nouveau_pushbuf_data(push,
+                              buf_count->bo,
+                              buf_count->offset + info->indirect_params_offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | 4);
+      }
       if (pushes == 1) {
          nouveau_pushbuf_data(push,
                               buf->bo, offset,

From b0d4ee520e20444172d088d11260e656fc1cf12d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Wed, 6 Jan 2016 15:30:37 -0800
Subject: [PATCH 201/241] nir/opcodes: Fix up uadd_carry and usub_borrow

Both were defined as returning bool but the gpu_shader5 functions are
defined to return int.  Also, we had the parameters for usub_borrow
backwards in the folding expression.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/nir/nir_opcodes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 1cd01a4fe92..d31507fe531 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -361,12 +361,12 @@ binop("udiv", tuint, "", "src0 / src1")
 # returns a boolean representing the carry resulting from the addition of
 # the two unsigned arguments.
 
-binop_convert("uadd_carry", tbool, tuint, commutative, "src0 + src1 < src0")
+binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")
 
 # returns a boolean representing the borrow resulting from the subtraction
 # of the two unsigned arguments.
 
-binop_convert("usub_borrow", tbool, tuint, "", "src1 < src0")
+binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 
 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

From d00abcc28376116554799d403211367470dff200 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Wed, 6 Jan 2016 15:30:38 -0800
Subject: [PATCH 202/241] nir/algebraic: Add more lowering

This commit adds lowering options for the following opcodes:

 - nir_op_fmod
 - nir_op_bitfield_insert
 - nir_op_uadd_carry
 - nir_op_usub_borrow

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/nir/nir.h                | 4 ++++
 src/glsl/nir/nir_opt_algebraic.py | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index fed8a973416..23aec694d95 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1446,6 +1446,10 @@ typedef struct nir_shader_compiler_options {
    bool lower_fpow;
    bool lower_fsat;
    bool lower_fsqrt;
+   bool lower_fmod;
+   bool lower_bitfield_insert;
+   bool lower_uadd_carry;
+   bool lower_usub_borrow;
    /** lowers fneg and ineg to fsub and isub. */
    bool lower_negate;
    /** lowers fsub and isub to fadd+fneg and iadd+ineg. */
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index c553de577ee..1eb044a5a20 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -222,6 +222,12 @@ optimizations = [
    (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
    (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
    (('iabs', ('isub', 0, a)), ('iabs', a)),
+
+   # Misc. lowering
+   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+   (('bitfield_insert', a, b, c, d), ('bfi', ('bfm', d, c), b, a), 'options->lower_bitfield_insert'),
+   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
+   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
 ]
 
 # Add optimizations to handle the case where the result of a ternary is

From 040e314143f973968169bab8ef379bac68fc8626 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Wed, 6 Jan 2016 15:30:39 -0800
Subject: [PATCH 203/241] i965/compiler: Enable more lowering in NIR

We don't need these for GLSL or ARB, but we need them for SPIR-V

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 4bd24a70b55..49ff835fa85 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -104,6 +104,13 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
     */
    nir_options->lower_ffma = true;
    nir_options->lower_sub = true;
+   nir_options->lower_fdiv = true;
+   nir_options->lower_scmp = true;
+   nir_options->lower_fmod = true;
+   nir_options->lower_bitfield_insert = true;
+   nir_options->lower_uadd_carry = true;
+   nir_options->lower_usub_borrow = true;
+
    /* In the vec4 backend, our dpN instruction replicates its result to all
     * the components of a vec4.  We would like NIR to give us replicated fdot
     * instructions because it can optimize better for us.

From d6db7ceedf60622a3c17334d764ca1bcddb5935a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 7 Jan 2016 15:27:52 -0500
Subject: [PATCH 204/241] mesa: check that internalformat of CopyTexImage*D is
 not 1, 2, 3, 4

The piglit copyteximage check has recently been augmented to test this, but
apparently it hasn't been fixed in Mesa so far.

This language also already appears in the OpenGL 2.1 spec (Ian).

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/teximage.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 73b3318e948..50141be8693 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -2247,6 +2247,22 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
                      _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
+   } else {
+      /*
+       * Section 8.6 (Alternate Texture Image Specification Commands) of the
+       * OpenGL 4.5 (Compatibility Profile) spec says:
+       *
+       *     "Parameters level, internalformat, and border are specified using
+       *     the same values, with the same meanings, as the corresponding
+       *     arguments of TexImage2D, except that internalformat may not be
+       *     specified as 1, 2, 3, or 4."
+       */
+      if (internalFormat >= 1 && internalFormat <= 4) {
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "glCopyTexImage%dD(internalFormat=%d)", dimensions,
+                     internalFormat);
+         return GL_TRUE;
+      }
    }
 
    baseFormat = _mesa_base_tex_format(ctx, internalFormat);

From b42254eff3d982c6cf649e37029c928290d781fd Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Fri, 8 Jan 2016 03:44:45 +1100
Subject: [PATCH 205/241] gallium/aux: Use TGSI chan name defines in place of
 literals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_util.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 653e650dc4c..5fff3f0787f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -29,6 +29,7 @@
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
 #include "tgsi_util.h"
+#include "tgsi_exec.h"
 
 union pointer_hack
 {
@@ -53,17 +54,17 @@ tgsi_util_get_src_register_swizzle(
    const struct tgsi_src_register *reg,
    unsigned component )
 {
-   switch( component ) {
-   case 0:
+   switch (component) {
+   case TGSI_CHAN_X:
       return reg->SwizzleX;
-   case 1:
+   case TGSI_CHAN_Y:
       return reg->SwizzleY;
-   case 2:
+   case TGSI_CHAN_Z:
       return reg->SwizzleZ;
-   case 3:
+   case TGSI_CHAN_W:
       return reg->SwizzleW;
    default:
-      assert( 0 );
+      assert(0);
    }
    return 0;
 }

From cb513485a0afba543a2bedb2ecd4d9ea979b5f9a Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Fri, 8 Jan 2016 03:44:46 +1100
Subject: [PATCH 206/241] radeon, si: Use TGSI chan name defines in
 lp_build_emit_fetch() calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r600/r600_llvm.c     |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 7f436067551..8b91372f3ae 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -726,7 +726,7 @@ static void tex_fetch_args(
 		 * That operand should be passed as a float value in the args array
 		 * right after the coord vector. After packing it's not used anymore,
 		 * that's why arg_count is not increased */
-		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 	}
 
 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 97645315049..44b03238919 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2453,7 +2453,7 @@ static void tex_fetch_args(
 		emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 		emit_data->args[0] = res;
 		emit_data->args[1] = bld_base->uint_bld.zero;
-		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
+		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
 		emit_data->arg_count = 3;
 		return;
 	}
@@ -2502,12 +2502,12 @@ static void tex_fetch_args(
 	if (opcode == TGSI_OPCODE_TXB)
 		address[count++] = coords[3];
 	if (opcode == TGSI_OPCODE_TXB2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
 	/* Pack depth comparison value */
 	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
 		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 		} else {
 			assert(ref_pos >= 0);
 			address[count++] = coords[ref_pos];
@@ -2578,7 +2578,7 @@ static void tex_fetch_args(
 	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
 		address[count++] = coords[3];
 	else if (opcode == TGSI_OPCODE_TXL2)
-		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
 	if (count > 16) {
 		assert(!"Cannot handle more than 16 texture address parameters");
@@ -3071,10 +3071,10 @@ static void interp_fetch_args(
 		/* offset is in second src, first two channels */
 		emit_data->args[0] = lp_build_emit_fetch(bld_base,
 							 emit_data->inst, 1,
-							 0);
+							 TGSI_CHAN_X);
 		emit_data->args[1] = lp_build_emit_fetch(bld_base,
 							 emit_data->inst, 1,
-							 1);
+							 TGSI_CHAN_Y);
 		emit_data->arg_count = 2;
 	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
 		LLVMValueRef sample_position;
@@ -3085,7 +3085,7 @@ static void interp_fetch_args(
 		 * and place into first two channels.
 		 */
 		sample_id = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 1, 0);
+						emit_data->inst, 1, TGSI_CHAN_X);
 		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
 					     LLVMInt32TypeInContext(gallivm->context),
 					     "");

From 91e8f2b0a58c1f5a00f447f60213e9ec2f5b6e6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 2 Jan 2016 19:58:26 +0100
Subject: [PATCH 207/241] st/mesa: remove dead code from mesa_to_tgsi

These aren't part of ARB_fragment_program.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/mesa/state_tracker/st_mesa_to_tgsi.c | 51 ------------------------
 1 file changed, 51 deletions(-)

diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 4b9dc994ea5..d8f7b6c0725 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -475,24 +475,6 @@ static void emit_swz( struct st_translate *t,
 }
 
 
-/**
- * Negate the value of DDY to match GL semantics where (0,0) is the
- * lower-left corner of the window.
- * Note that the GL_ARB_fragment_coord_conventions extension will
- * effect this someday.
- */
-static void emit_ddy( struct st_translate *t,
-                      struct ureg_dst dst,
-                      const struct prog_src_register *SrcReg )
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_src src = translate_src( t, SrcReg );
-   src = ureg_negate( src );
-   ureg_DDY( ureg, dst, src );
-}
-
-
-
 static unsigned
 translate_opcode( unsigned op )
 {
@@ -714,10 +696,6 @@ compile_instruction(
        */
       ureg_MOV( ureg, dst[0], ureg_imm1f(ureg, 0.5) );
       break;
-		 
-   case OPCODE_DDY:
-      emit_ddy( t, dst[0], &inst->SrcReg[0] );
-      break;
 
    case OPCODE_RSQ:
       ureg_RSQ( ureg, dst[0], ureg_abs(src[0]) );
@@ -925,31 +903,6 @@ emit_wpos(struct st_context *st,
 }
 
 
-/**
- * OpenGL's fragment gl_FrontFace input is 1 for front-facing, 0 for back.
- * TGSI uses +1 for front, -1 for back.
- * This function converts the TGSI value to the GL value.  Simply clamping/
- * saturating the value to [0,1] does the job.
- */
-static void
-emit_face_var( struct st_translate *t,
-               const struct gl_program *program )
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_dst face_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src face_input = t->inputs[t->inputMapping[VARYING_SLOT_FACE]];
-
-   /* MOV_SAT face_temp, input[face]
-    */
-   face_temp = ureg_saturate( face_temp );
-   ureg_MOV( ureg, face_temp, face_input );
-
-   /* Use face_temp as face input from here on:
-    */
-   t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
-}
-
-
 /**
  * Translate Mesa program to TGSI format.
  * \param program  the program to translate
@@ -1020,10 +973,6 @@ st_translate_mesa_program(
          emit_wpos(st_context(ctx), t, program, ureg);
       }
 
-      if (program->InputsRead & VARYING_BIT_FACE) {
-         emit_face_var( t, program );
-      }
-
       /*
        * Declare output attributes.
        */

From c886422656f38593e1db3700ae747058f55125d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 7 Jan 2016 23:25:48 +0100
Subject: [PATCH 208/241] tgsi/ureg: remove index parameter from
 ureg_DECL_system_value

It can be trivially derived from the number of already declared system
values. This allows ureg users not to worry about which index to choose.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/auxiliary/tgsi/tgsi_ureg.c     | 12 ++++++------
 src/gallium/auxiliary/tgsi/tgsi_ureg.h     |  1 -
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  8 +++++---
 src/mesa/state_tracker/st_mesa_to_tgsi.c   |  8 +++++---
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 4aaf8dfe6d8..964272386b3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -115,7 +115,6 @@ struct ureg_program
    unsigned vs_inputs[PIPE_MAX_ATTRIBS/32];
 
    struct {
-      unsigned index;
       unsigned semantic_name;
       unsigned semantic_index;
    } system_value[UREG_MAX_SYSTEM_VALUE];
@@ -320,20 +319,21 @@ ureg_DECL_input(struct ureg_program *ureg,
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *ureg,
-                       unsigned index,
                        unsigned semantic_name,
                        unsigned semantic_index)
 {
+   unsigned i = 0;
+
    if (ureg->nr_system_values < UREG_MAX_SYSTEM_VALUE) {
-      ureg->system_value[ureg->nr_system_values].index = index;
       ureg->system_value[ureg->nr_system_values].semantic_name = semantic_name;
       ureg->system_value[ureg->nr_system_values].semantic_index = semantic_index;
+      i = ureg->nr_system_values;
       ureg->nr_system_values++;
    } else {
       set_bad(ureg);
    }
 
-   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, index);
+   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, i);
 }
 
 
@@ -1587,8 +1587,8 @@ static void emit_decls( struct ureg_program *ureg )
    for (i = 0; i < ureg->nr_system_values; i++) {
       emit_decl_semantic(ureg,
                          TGSI_FILE_SYSTEM_VALUE,
-                         ureg->system_value[i].index,
-                         ureg->system_value[i].index,
+                         i,
+                         i,
                          ureg->system_value[i].semantic_name,
                          ureg->system_value[i].semantic_index,
                          TGSI_WRITEMASK_XYZW, 0);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 0aae550d60a..5f15ebac517 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -221,7 +221,6 @@ ureg_DECL_input(struct ureg_program *,
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *,
-                       unsigned index,
                        unsigned semantic_name,
                        unsigned semantic_index);
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 2adb57d11ad..6cbc26acf79 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5370,11 +5370,13 @@ st_translate_program(
     */
    {
       GLbitfield sysInputs = proginfo->SystemValuesRead;
-      unsigned numSys = 0;
+
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
             unsigned semName = _mesa_sysval_to_semantic[i];
-            t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
+
+            t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
+
             if (semName == TGSI_SEMANTIC_INSTANCEID ||
                 semName == TGSI_SEMANTIC_VERTEXID) {
                /* From Gallium perspective, these system values are always
@@ -5395,7 +5397,7 @@ st_translate_program(
                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
                }
             }
-            numSys++;
+
             sysInputs &= ~(1 << i);
          }
       }
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index d8f7b6c0725..20dc3d15df7 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -1049,11 +1049,13 @@ st_translate_mesa_program(
     */
    {
       GLbitfield sysInputs = program->SystemValuesRead;
-      unsigned numSys = 0;
+
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
             unsigned semName = _mesa_sysval_to_semantic[i];
-            t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
+
+            t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
+
             if (semName == TGSI_SEMANTIC_INSTANCEID ||
                 semName == TGSI_SEMANTIC_VERTEXID) {
                /* From Gallium perspective, these system values are always
@@ -1074,7 +1076,7 @@ st_translate_mesa_program(
                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
                }
             }
-            numSys++;
+
             sysInputs &= ~(1 << i);
          }
       }

From c07cf5f5a92d3c7d433a05a06faf9d262a732f4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 7 Jan 2016 23:37:53 +0100
Subject: [PATCH 209/241] tgsi/ureg: handle redundant declarations in
 ureg_DECL_system_value

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/gallium/auxiliary/tgsi/tgsi_ureg.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 964272386b3..5b78542413e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -322,7 +322,14 @@ ureg_DECL_system_value(struct ureg_program *ureg,
                        unsigned semantic_name,
                        unsigned semantic_index)
 {
-   unsigned i = 0;
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_system_values; i++) {
+      if (ureg->system_value[i].semantic_name == semantic_name &&
+          ureg->system_value[i].semantic_index == semantic_index) {
+         goto out;
+      }
+   }
 
    if (ureg->nr_system_values < UREG_MAX_SYSTEM_VALUE) {
       ureg->system_value[ureg->nr_system_values].semantic_name = semantic_name;
@@ -333,6 +340,7 @@ ureg_DECL_system_value(struct ureg_program *ureg,
       set_bad(ureg);
    }
 
+out:
    return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, i);
 }
 

From 4191c1a57c1e806a078bfc5b074b557ff2b54c35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 2 Jan 2016 20:16:16 +0100
Subject: [PATCH 210/241] glsl: optionally declare gl_FragCoord &
 gl_FrontFacing as system values

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/glsl/ast_to_hir.cpp                    |  2 +-
 src/glsl/builtin_variables.cpp             | 12 ++++++++++--
 src/glsl/nir/shader_enums.h                |  3 ++-
 src/mesa/main/mtypes.h                     |  4 ++++
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  1 +
 5 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index e6aec3654b8..f3966d7e3f4 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3517,7 +3517,7 @@ get_variable_being_redeclared(ir_variable *var, YYLTYPE loc,
               state->is_version(150, 0))
               && strcmp(var->name, "gl_FragCoord") == 0
               && earlier->type == var->type
-              && earlier->data.mode == var->data.mode) {
+              && var->data.mode == ir_var_shader_in) {
       /* Allow redeclaration of gl_FragCoord for ARB_fcc layout
        * qualifiers.
        */
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index e82c99ee3bb..221aab0043b 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -1057,8 +1057,16 @@ builtin_variable_generator::generate_fs_special_vars()
 {
    ir_variable *var;
 
-   add_input(VARYING_SLOT_POS, vec4_t, "gl_FragCoord");
-   add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing");
+   if (this->state->ctx->Const.GLSLFragCoordIsSysVal)
+      add_system_value(SYSTEM_VALUE_FRAG_COORD, vec4_t, "gl_FragCoord");
+   else
+      add_input(VARYING_SLOT_POS, vec4_t, "gl_FragCoord");
+
+   if (this->state->ctx->Const.GLSLFrontFacingIsSysVal)
+      add_system_value(SYSTEM_VALUE_FRONT_FACE, bool_t, "gl_FrontFacing");
+   else
+      add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing");
+
    if (state->is_version(120, 100))
       add_input(VARYING_SLOT_PNTC, vec2_t, "gl_PointCoord");
 
diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h
index 0be217c0cf7..8a2a81a333d 100644
--- a/src/glsl/nir/shader_enums.h
+++ b/src/glsl/nir/shader_enums.h
@@ -412,7 +412,8 @@ typedef enum
     * \name Fragment shader system values
     */
    /*@{*/
-   SYSTEM_VALUE_FRONT_FACE,     /**< (not done yet) */
+   SYSTEM_VALUE_FRAG_COORD,
+   SYSTEM_VALUE_FRONT_FACE,
    SYSTEM_VALUE_SAMPLE_ID,
    SYSTEM_VALUE_SAMPLE_POS,
    SYSTEM_VALUE_SAMPLE_MASK_IN,
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index dd52368ef65..41f5283679d 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3515,6 +3515,10 @@ struct gl_constants
     */
    GLboolean GLSLSkipStrictMaxUniformLimitCheck;
 
+   /** Whether gl_FragCoord and gl_FrontFacing are system values. */
+   bool GLSLFragCoordIsSysVal;
+   bool GLSLFrontFacingIsSysVal;
+
    /**
     * Always use the GetTransformFeedbackVertexCount() driver hook, rather
     * than passing the transform feedback object to the drawing function.
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 6cbc26acf79..9308eb4841e 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -4459,6 +4459,7 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
 
    /* Fragment shader
     */
+   TGSI_SEMANTIC_POSITION,
    TGSI_SEMANTIC_FACE,
    TGSI_SEMANTIC_SAMPLEID,
    TGSI_SEMANTIC_SAMPLEPOS,

From 24737f2298619844685e7deceaeb8dbfc2165ee3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 2 Jan 2016 23:08:27 +0100
Subject: [PATCH 211/241] program: add a helper for rewriting FP position input
 to sysval

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/program/programopt.c | 27 +++++++++++++++++++++++++++
 src/mesa/program/programopt.h |  2 ++
 2 files changed, 29 insertions(+)

diff --git a/src/mesa/program/programopt.c b/src/mesa/program/programopt.c
index af78150d594..24dde57725e 100644
--- a/src/mesa/program/programopt.c
+++ b/src/mesa/program/programopt.c
@@ -589,3 +589,30 @@ _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type)
       }
    }
 }
+
+void
+_mesa_program_fragment_position_to_sysval(struct gl_program *prog)
+{
+   GLuint i;
+
+   if (prog->Target != GL_FRAGMENT_PROGRAM_ARB ||
+       !(prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_POS)))
+      return;
+
+   prog->InputsRead &= ~BITFIELD64_BIT(VARYING_SLOT_POS);
+   prog->SystemValuesRead |= 1 << SYSTEM_VALUE_FRAG_COORD;
+
+   for (i = 0; i < prog->NumInstructions; i++) {
+      struct prog_instruction *inst = prog->Instructions + i;
+      const GLuint numSrc = _mesa_num_inst_src_regs(inst->Opcode);
+      GLuint j;
+
+      for (j = 0; j < numSrc; j++) {
+         if (inst->SrcReg[j].File == PROGRAM_INPUT &&
+             inst->SrcReg[j].Index == VARYING_SLOT_POS) {
+            inst->SrcReg[j].File = PROGRAM_SYSTEM_VALUE;
+            inst->SrcReg[j].Index = SYSTEM_VALUE_FRAG_COORD;
+         }
+      }
+   }
+}
diff --git a/src/mesa/program/programopt.h b/src/mesa/program/programopt.h
index 757421edfe1..1520d161ea8 100644
--- a/src/mesa/program/programopt.h
+++ b/src/mesa/program/programopt.h
@@ -51,6 +51,8 @@ _mesa_count_texture_instructions(struct gl_program *prog);
 extern void
 _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type);
 
+extern void
+_mesa_program_fragment_position_to_sysval(struct gl_program *prog);
 
 #ifdef __cplusplus
 }

From 34738a92dea31ab91edb62bf83a3fe1ca44c35a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 2 Jan 2016 20:45:00 +0100
Subject: [PATCH 212/241] gallium: add caps for POSITION and FACE system values

v2: document the integer behavior

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/docs/source/screen.rst            |  5 +++++
 src/gallium/docs/source/tgsi.rst              | 19 +++++++++++++------
 .../drivers/freedreno/freedreno_screen.c      |  2 ++
 src/gallium/drivers/i915/i915_screen.c        |  2 ++
 src/gallium/drivers/ilo/ilo_screen.c          |  2 ++
 src/gallium/drivers/llvmpipe/lp_screen.c      |  2 ++
 .../drivers/nouveau/nv30/nv30_screen.c        |  2 ++
 .../drivers/nouveau/nv50/nv50_screen.c        |  2 ++
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  2 ++
 src/gallium/drivers/r300/r300_screen.c        |  2 ++
 src/gallium/drivers/r600/r600_pipe.c          |  2 ++
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 ++
 src/gallium/drivers/softpipe/sp_screen.c      |  2 ++
 src/gallium/drivers/svga/svga_screen.c        |  2 ++
 src/gallium/drivers/vc4/vc4_screen.c          |  2 ++
 src/gallium/drivers/virgl/virgl_screen.c      |  2 ++
 src/gallium/include/pipe/p_defines.h          |  2 ++
 17 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 39ecc6334eb..fc08bb9ac34 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -292,6 +292,11 @@ The integer capabilities:
   supported in vertex shaders.
 * ``PIPE_CAP_TGSI_PACK_HALF_FLOAT``: Whether the ``UP2H`` and ``PK2H``
   TGSI opcodes are supported.
+* ``PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL``: If state trackers should use
+  a system value for the POSITION fragment shader input.
+* ``PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL``: If state trackers should use
+  a system value for the FACE fragment shader input.
+  Also, the FACE system value is integer, not float.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 2149d08419a..8fe971b2f7a 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2658,7 +2658,8 @@ space coordinate system.  After clipping, the X, Y and Z components of the
 vertex will be divided by the W value to get normalized device coordinates.
 
 For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
-fragment shader input contains the fragment's window position.  The X
+fragment shader input (or system value, depending on which one is
+supported by the driver) contains the fragment's window position.  The X
 component starts at zero and always increases from left to right.
 The Y component starts at zero and always increases but Y=0 may either
 indicate the top of the window or the bottom depending on the fragment
@@ -2770,11 +2771,17 @@ typically only used for legacy graphics APIs.
 TGSI_SEMANTIC_FACE
 """"""""""""""""""
 
-This label applies to fragment shader inputs only and indicates that
-the register contains front/back-face information of the form (F, 0,
-0, 1).  The first component will be positive when the fragment belongs
-to a front-facing polygon, and negative when the fragment belongs to a
-back-facing polygon.
+This label applies to fragment shader inputs (or system values,
+depending on which one is supported by the driver) and indicates that
+the register contains front/back-face information.
+
+If it is an input, it will be a floating-point vector in the form (F, 0, 0, 1),
+where F will be positive when the fragment belongs to a front-facing polygon,
+and negative when the fragment belongs to a back-facing polygon.
+
+If it is a system value, it will be an integer vector in the form (F, 0, 0, 1),
+where F is 0xffffffff when the fragment belongs to a front-facing polygon and
+0 when the fragment belongs to a back-facing polygon.
 
 
 TGSI_SEMANTIC_EDGEFLAG
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index a8030f2ff60..e940b1c21e6 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -242,6 +242,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index f42fc37abe5..2289eb58c49 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -256,6 +256,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 3a18e7415b9..c26d4492d3a 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -480,6 +480,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 0898cff3a2e..1407b2688de 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -305,6 +305,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 6c4a0f31ab6..e63767d8aa0 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -178,6 +178,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index d6131c2f994..68d2acd4bcd 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -221,6 +221,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index c8510b8bb5a..e4749eed7ab 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -210,6 +210,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 5e67a2ffd15..a376590ab61 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -204,6 +204,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_CLEAR_TEXTURE:
         case PIPE_CAP_DRAW_PARAMETERS:
         case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 563a892c4a2..9e5824202aa 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -352,6 +352,8 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 64f28d9c7be..50b23472467 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -344,6 +344,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index e74044b5b0b..36510d5eb40 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -255,6 +255,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 17781118503..4285b1c2e00 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -388,6 +388,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
    }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 8bbacc60d41..58f8ad9d510 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -193,6 +193,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_CLEAR_TEXTURE:
 	case PIPE_CAP_DRAW_PARAMETERS:
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 05ce58415e2..bf048da42a3 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -223,6 +223,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 591bbf32556..8e48528944e 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -639,6 +639,8 @@ enum pipe_cap
    PIPE_CAP_TGSI_PACK_HALF_FLOAT,
    PIPE_CAP_MULTI_DRAW_INDIRECT,
    PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS,
+   PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL,
+   PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From c00e534283d26fd66e2f52962cf55435dc7a4597 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 8 Jan 2016 01:45:34 +0100
Subject: [PATCH 213/241] tgsi/scan: update for POSITION and FACE system values

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 6ea32eedd74..7a02e27e01e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -373,7 +373,10 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                      info->uses_primid = TRUE;
                   } else if (semName == TGSI_SEMANTIC_INVOCATIONID) {
                      info->uses_invocationid = TRUE;
-                  }
+                  } else if (semName == TGSI_SEMANTIC_POSITION)
+                     info->reads_position = TRUE;
+                  else if (semName == TGSI_SEMANTIC_FACE)
+                     info->uses_frontface = TRUE;
                }
                else if (file == TGSI_FILE_OUTPUT) {
                   info->output_semantic_name[reg] = (ubyte) semName;

From 8a13ce14fd4e29e4e74322c1a3d548960f7f9bc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 2 Jan 2016 22:45:10 +0100
Subject: [PATCH 214/241] st/mesa: add support for POSITION and FACE system
 values

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_extensions.c     |  5 ++++
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 27 +++++++++++++++-------
 src/mesa/state_tracker/st_mesa_to_tgsi.c   | 23 ++++++++++++------
 src/mesa/state_tracker/st_program.c        |  5 +++-
 4 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 91ad169d786..2a3e52362e4 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -327,6 +327,11 @@ void st_init_limits(struct pipe_screen *screen,
          c->Program[MESA_SHADER_FRAGMENT].MaxUniformBlocks;
       assert(c->MaxCombinedUniformBlocks <= MAX_COMBINED_UNIFORM_BUFFERS);
    }
+
+   c->GLSLFragCoordIsSysVal =
+      screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL);
+   c->GLSLFrontFacingIsSysVal =
+      screen->get_param(screen, PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL);
 }
 
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 9308eb4841e..27a0a4f51e1 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -4906,10 +4906,11 @@ compile_tgsi_instruction(struct st_translate *t,
  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_wpos_adjustment( struct st_translate *t,
-                      int wpos_transform_const,
-                      boolean invert,
-                      GLfloat adjX, GLfloat adjY[2])
+emit_wpos_adjustment(struct gl_context *ctx,
+                     struct st_translate *t,
+                     int wpos_transform_const,
+                     boolean invert,
+                     GLfloat adjX, GLfloat adjY[2])
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -4921,7 +4922,11 @@ emit_wpos_adjustment( struct st_translate *t,
     */
    struct ureg_src wpostrans = ureg_DECL_constant(ureg, wpos_transform_const);
    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src wpos_input = t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src *wpos =
+      ctx->Const.GLSLFragCoordIsSysVal ?
+         &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
+         &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src wpos_input = *wpos;
 
    /* First, apply the coordinate shift: */
    if (adjX || adjY[0] || adjY[1]) {
@@ -4972,7 +4977,7 @@ emit_wpos_adjustment( struct st_translate *t,
 
    /* Use wpos_temp as position input from here on:
     */
-   t->inputs[t->inputMapping[VARYING_SLOT_POS]] = ureg_src(wpos_temp);
+   *wpos = ureg_src(wpos_temp);
 }
 
 
@@ -5081,7 +5086,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   emit_wpos_adjustment(t, wpos_transform_const, invert, adjX, adjY);
+   emit_wpos_adjustment(st->ctx, t, wpos_transform_const, invert, adjX, adjY);
 }
 
 /**
@@ -5399,6 +5404,11 @@ st_translate_program(
                }
             }
 
+            if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                semName == TGSI_SEMANTIC_POSITION)
+               emit_wpos(st_context(ctx), t, proginfo, ureg,
+                         program->wpos_transform_const);
+
             sysInputs &= ~(1 << i);
          }
       }
@@ -5685,7 +5695,8 @@ get_mesa_program(struct gl_context *ctx,
 
    /* This must be done before the uniform storage is associated. */
    if (shader->Type == GL_FRAGMENT_SHADER &&
-       prog->InputsRead & VARYING_BIT_POS){
+       (prog->InputsRead & VARYING_BIT_POS ||
+        prog->SystemValuesRead & (1 << SYSTEM_VALUE_FRAG_COORD))) {
       static const gl_state_index wposTransformState[STATE_LENGTH] = {
          STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
       };
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 20dc3d15df7..be47823a048 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -717,10 +717,11 @@ compile_instruction(
  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_wpos_adjustment( struct st_translate *t,
-                      const struct gl_program *program,
-                      boolean invert,
-                      GLfloat adjX, GLfloat adjY[2])
+emit_wpos_adjustment(struct gl_context *ctx,
+                     struct st_translate *t,
+                     const struct gl_program *program,
+                     boolean invert,
+                     GLfloat adjX, GLfloat adjY[2])
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -740,7 +741,11 @@ emit_wpos_adjustment( struct st_translate *t,
 
    struct ureg_src wpostrans = ureg_DECL_constant( ureg, wposTransConst );
    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src wpos_input = t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src *wpos =
+      ctx->Const.GLSLFragCoordIsSysVal ?
+         &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
+         &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src wpos_input = *wpos;
 
    /* First, apply the coordinate shift: */
    if (adjX || adjY[0] || adjY[1]) {
@@ -791,7 +796,7 @@ emit_wpos_adjustment( struct st_translate *t,
 
    /* Use wpos_temp as position input from here on:
     */
-   t->inputs[t->inputMapping[VARYING_SLOT_POS]] = ureg_src(wpos_temp);
+   *wpos = ureg_src(wpos_temp);
 }
 
 
@@ -899,7 +904,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   emit_wpos_adjustment(t, program, invert, adjX, adjY);
+   emit_wpos_adjustment(st->ctx, t, program, invert, adjX, adjY);
 }
 
 
@@ -1077,6 +1082,10 @@ st_translate_mesa_program(
                }
             }
 
+            if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                semName == TGSI_SEMANTIC_POSITION)
+               emit_wpos(st_context(ctx), t, program, ureg);
+
             sysInputs &= ~(1 << i);
          }
       }
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 39c54c256e0..b3954547418 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -581,8 +581,11 @@ st_translate_fragment_program(struct st_context *st,
 
    memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr));
 
-   if (!stfp->glsl_to_tgsi)
+   if (!stfp->glsl_to_tgsi) {
       _mesa_remove_output_reads(&stfp->Base.Base, PROGRAM_OUTPUT);
+      if (st->ctx->Const.GLSLFragCoordIsSysVal)
+         _mesa_program_fragment_position_to_sysval(&stfp->Base.Base);
+   }
 
    /*
     * Convert Mesa program inputs to TGSI input register semantics.

From 69f43c2cc903d5973bab2515be51465c9e8f9f9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 7 Jan 2016 19:48:56 +0100
Subject: [PATCH 215/241] util/pstipple: allow fragment shader POSITION to be a
 system value

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/util/u_pstipple.c       | 30 +++++++++++++++----
 src/gallium/auxiliary/util/u_pstipple.h       |  3 +-
 src/gallium/drivers/radeonsi/si_shader.c      |  3 +-
 .../drivers/softpipe/sp_state_shader.c        |  3 +-
 src/gallium/drivers/svga/svga_tgsi_insn.c     |  3 +-
 src/gallium/drivers/svga/svga_tgsi_vgpu10.c   |  3 +-
 6 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index 0bb46ff8dd1..08dec13846d 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -177,6 +177,7 @@ struct pstip_transform_context {
    struct tgsi_shader_info info;
    uint tempsUsed;  /**< bitmask */
    int wincoordInput;
+   unsigned wincoordFile;
    int maxInput;
    uint samplersUsed;  /**< bitfield of samplers used */
    int freeSampler;  /** an available sampler for the pstipple */
@@ -206,7 +207,7 @@ pstip_transform_decl(struct tgsi_transform_context *ctx,
          pctx->samplersUsed |= 1 << i;
       }
    }
-   else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+   else if (decl->Declaration.File == pctx->wincoordFile) {
       pctx->maxInput = MAX2(pctx->maxInput, (int) decl->Range.Last);
       if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION)
          pctx->wincoordInput = (int) decl->Range.First;
@@ -275,10 +276,22 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
       wincoordInput = pctx->wincoordInput;
 
    if (pctx->wincoordInput < 0) {
+      struct tgsi_full_declaration decl;
+
+      decl = tgsi_default_full_declaration();
       /* declare new position input reg */
-      tgsi_transform_input_decl(ctx, wincoordInput,
-                                TGSI_SEMANTIC_POSITION, 1,
-                                TGSI_INTERPOLATE_LINEAR);
+      decl.Declaration.File = pctx->wincoordFile;
+      decl.Declaration.Semantic = 1;
+      decl.Semantic.Name = TGSI_SEMANTIC_POSITION;
+      decl.Range.First =
+      decl.Range.Last = wincoordInput;
+
+      if (pctx->wincoordFile == TGSI_FILE_INPUT) {
+         decl.Declaration.Interpolate = 1;
+         decl.Interp.Interpolate = TGSI_INTERPOLATE_LINEAR;
+      }
+
+      ctx->emit_declaration(ctx, &decl);
    }
 
    sampIdx = pctx->hasFixedUnit ? pctx->fixedUnit : pctx->freeSampler;
@@ -327,7 +340,7 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
    tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
                            TGSI_FILE_TEMPORARY, texTemp,
                            TGSI_WRITEMASK_XYZW,
-                           TGSI_FILE_INPUT, wincoordInput,
+                           pctx->wincoordFile, wincoordInput,
                            TGSI_FILE_IMMEDIATE, pctx->numImmed);
 
    /* TEX texTemp, texTemp, sampler; */
@@ -351,11 +364,15 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
  *                        will be used to sample the stipple texture;
  *                        if NULL, the fixed unit is used
  * \param fixedUnit       fixed texture unit used for the stipple texture
+ * \param wincoordFile    TGSI_FILE_INPUT or TGSI_FILE_SYSTEM_VALUE,
+ *                        depending on which one is supported by the driver
+ *                        for TGSI_SEMANTIC_POSITION in the fragment shader
  */
 struct tgsi_token *
 util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
                                      unsigned *samplerUnitOut,
-                                     unsigned fixedUnit)
+                                     unsigned fixedUnit,
+                                     unsigned wincoordFile)
 {
    struct pstip_transform_context transform;
    const uint newLen = tgsi_num_tokens(tokens) + NUM_NEW_TOKENS;
@@ -370,6 +387,7 @@ util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
     */
    memset(&transform, 0, sizeof(transform));
    transform.wincoordInput = -1;
+   transform.wincoordFile = wincoordFile;
    transform.maxInput = -1;
    transform.coordOrigin = TGSI_FS_COORD_ORIGIN_UPPER_LEFT;
    transform.hasFixedUnit = !samplerUnitOut;
diff --git a/src/gallium/auxiliary/util/u_pstipple.h b/src/gallium/auxiliary/util/u_pstipple.h
index 249c58be95f..ef8396f4318 100644
--- a/src/gallium/auxiliary/util/u_pstipple.h
+++ b/src/gallium/auxiliary/util/u_pstipple.h
@@ -50,7 +50,8 @@ util_pstipple_create_sampler(struct pipe_context *pipe);
 struct tgsi_token *
 util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
                                      unsigned *samplerUnitOut,
-                                     unsigned fixed_unit);
+                                     unsigned fixed_unit,
+                                     unsigned wincoordFile);
 
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 44b03238919..1db3e484915 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4066,7 +4066,8 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
 	if (poly_stipple) {
 		tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-						SI_POLY_STIPPLE_SAMPLER);
+						SI_POLY_STIPPLE_SAMPLER,
+						TGSI_FILE_INPUT);
 		tgsi_scan_shader(tokens, &stipple_shader_info);
 	}
 
diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c
index dce0404de5b..f0d66a53ec6 100644
--- a/src/gallium/drivers/softpipe/sp_state_shader.c
+++ b/src/gallium/drivers/softpipe/sp_state_shader.c
@@ -64,7 +64,8 @@ create_fs_variant(struct softpipe_context *softpipe,
          /* get new shader that implements polygon stippling */
          var->tokens = 
             util_pstipple_create_fragment_shader(curfs->tokens,
-                                                 &var->stipple_sampler_unit, 0);
+                                                 &var->stipple_sampler_unit, 0,
+                                                 TGSI_FILE_INPUT);
       }
       else
 #endif
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 970e70aabf9..489e68f88e8 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -3844,7 +3844,8 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
    if (emit->unit == PIPE_SHADER_FRAGMENT && emit->key.fs.pstipple) {
       unsigned unit;
 
-      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                        TGSI_FILE_INPUT);
 
       if (new_tokens) {
          /* Setup texture state for stipple */
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index c5be11f936e..098f6f5a28d 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -6609,7 +6609,8 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit,
       tgsi_dump(tokens,0);
    }
 
-   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                     TGSI_FILE_INPUT);
 
    emit->fs.pstipple_sampler_unit = unit;
 

From d0cf66d8358448ea752a1d84fdb1503e61b49065 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 7 Jan 2016 23:14:55 +0100
Subject: [PATCH 216/241] vl: allow fragment shader POSITION to be a system
 value

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/vl/vl_mc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/vl/vl_mc.c b/src/gallium/auxiliary/vl/vl_mc.c
index 6c317bbe04a..eb703a90445 100644
--- a/src/gallium/auxiliary/vl/vl_mc.c
+++ b/src/gallium/auxiliary/vl/vl_mc.c
@@ -79,14 +79,18 @@ calc_position(struct vl_mc *r, struct ureg_program *shader, struct ureg_src bloc
 }
 
 static struct ureg_dst
-calc_line(struct ureg_program *shader)
+calc_line(struct pipe_screen *screen, struct ureg_program *shader)
 {
    struct ureg_dst tmp;
    struct ureg_src pos;
 
    tmp = ureg_DECL_temporary(shader);
 
-   pos = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS, TGSI_INTERPOLATE_LINEAR);
+   if (screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL))
+      pos = ureg_DECL_system_value(shader, TGSI_SEMANTIC_POSITION, 0);
+   else
+      pos = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS,
+                               TGSI_INTERPOLATE_LINEAR);
 
    /*
     * tmp.y = fraction(pos.y / 2) >= 0.5 ? 1 : 0
@@ -177,7 +181,7 @@ create_ref_frag_shader(struct vl_mc *r)
 
    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
 
-   field = calc_line(shader);
+   field = calc_line(r->pipe->screen, shader);
 
    /*
     * ref = field.z ? tc[1] : tc[0]
@@ -324,7 +328,7 @@ create_ycbcr_frag_shader(struct vl_mc *r, float scale, bool invert,
 
    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
 
-   tmp = calc_line(shader);
+   tmp = calc_line(r->pipe->screen, shader);
 
    /*
     * if (field == tc.w)

From 1e463d20ba38d0af409b7b9b825b31330f4b4f0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 8 Jan 2016 02:11:16 +0100
Subject: [PATCH 217/241] nine: allow fragment shader POSITION and FACE to be
 system values

Reported-by: Axel Davy <axel.davy@ens.fr>
---
 src/gallium/state_trackers/nine/nine_ff.c     | 10 +++-
 src/gallium/state_trackers/nine/nine_shader.c | 48 ++++++++++++++-----
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index fe26086ef3d..0feaeab7330 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -1391,7 +1391,15 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
     /* Fog.
      */
     if (key->fog_mode) {
-        struct ureg_src vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0, TGSI_INTERPOLATE_LINEAR);
+        struct ureg_src vPos;
+        if (device->screen->get_param(device->screen,
+                                      PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
+            vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+        } else {
+            vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                      TGSI_INTERPOLATE_LINEAR);
+        }
+
         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
         if (key->fog_mode == D3DFOG_EXP) {
             ureg_MUL(ureg, rFog, _ZZZZ(vPos), _ZZZZ(_CONST(22)));
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 28f27870dc8..ed431738abc 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -462,6 +462,8 @@ struct shader_translator
     boolean lower_preds;
     boolean want_texcoord;
     boolean shift_wpos;
+    boolean wpos_is_sysval;
+    boolean face_is_sysval_integer;
     unsigned texcoord_sn;
 
     struct sm1_instruction insn; /* current instruction */
@@ -945,10 +947,16 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
     case D3DSPR_MISCTYPE:
         switch (param->idx) {
         case D3DSMO_POSITION:
-           if (ureg_src_is_undef(tx->regs.vPos))
-               tx->regs.vPos = ureg_DECL_fs_input(ureg,
-                                                  TGSI_SEMANTIC_POSITION, 0,
-                                                  TGSI_INTERPOLATE_LINEAR);
+           if (ureg_src_is_undef(tx->regs.vPos)) {
+              if (tx->wpos_is_sysval) {
+                  tx->regs.vPos =
+                      ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+              } else {
+                  tx->regs.vPos =
+                      ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
+              }
+           }
            if (tx->shift_wpos) {
                /* TODO: do this only once */
                struct ureg_dst wpos = tx_scratch(tx);
@@ -961,9 +969,20 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
            break;
         case D3DSMO_FACE:
            if (ureg_src_is_undef(tx->regs.vFace)) {
-               tx->regs.vFace = ureg_DECL_fs_input(ureg,
-                                                   TGSI_SEMANTIC_FACE, 0,
-                                                   TGSI_INTERPOLATE_CONSTANT);
+               if (tx->face_is_sysval_integer) {
+                   tmp = tx_scratch(tx);
+                   tx->regs.vFace =
+                       ureg_DECL_system_value(ureg, TGSI_SEMANTIC_FACE, 0);
+
+                   /* convert bool to float */
+                   ureg_UCMP(ureg, tmp, ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X),
+                             ureg_imm1f(ureg, 1), ureg_imm1f(ureg, -1));
+                   tx->regs.vFace = ureg_src(tmp);
+               } else {
+                   tx->regs.vFace = ureg_DECL_fs_input(ureg,
+                                                       TGSI_SEMANTIC_FACE, 0,
+                                                       TGSI_INTERPOLATE_CONSTANT);
+               }
                tx->regs.vFace = ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X);
            }
            src = tx->regs.vFace;
@@ -3259,10 +3278,15 @@ shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
         return;
     }
 
-    if (tx->info->fog_mode != D3DFOG_NONE)
-        depth = ureg_scalar(ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
-                                              TGSI_INTERPOLATE_LINEAR),
-                                              TGSI_SWIZZLE_Z);
+    if (tx->info->fog_mode != D3DFOG_NONE) {
+        if (tx->wpos_is_sysval) {
+            depth = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+        } else {
+            depth = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                       TGSI_INTERPOLATE_LINEAR);
+        }
+        depth = ureg_scalar(depth, TGSI_SWIZZLE_Z);
+    }
 
     nine_info_mark_const_f_used(tx->info, 33);
     fog_color = NINE_CONSTANT_SRC(32);
@@ -3344,6 +3368,8 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
     tx->shift_wpos = !GET_CAP(TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
     tx->texcoord_sn = tx->want_texcoord ?
         TGSI_SEMANTIC_TEXCOORD : TGSI_SEMANTIC_GENERIC;
+    tx->wpos_is_sysval = GET_CAP(TGSI_FS_POSITION_IS_SYSVAL);
+    tx->face_is_sysval_integer = GET_CAP(TGSI_FS_FACE_IS_INTEGER_SYSVAL);
 
     if (IS_VS) {
         tx->num_constf_allowed = NINE_MAX_CONST_F;

From 6613042c4ed4d8ef64fa21ad19a2131dae2f4702 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 7 Jan 2016 17:20:47 +0200
Subject: [PATCH 218/241] configure.ac: add --enable-profile

For profiling mesa's code, especially llvmpipe, PROFILE should be
defined. Currently, this define can only be generated if mesa is
built using scons.
This patch makes it possible to generate this define also when building
mesa through automake tools.

v2:

- Change --enable-llvmpipe-profile to --enable-profile
- Add -fno-omit-frame-pointer to CFLAGS and CXXFLAGS when enabling profile

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 configure.ac | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/configure.ac b/configure.ac
index b1c1d7df4d8..9c3d1a3481e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -476,8 +476,29 @@ AC_ARG_ENABLE([debug],
     [enable_debug="$enableval"],
     [enable_debug=no]
 )
+
+AC_ARG_ENABLE([profile],
+    [AS_HELP_STRING([--enable-profile],
+        [enable profiling of code @<:@default=disabled@:>@])],
+    [enable_profile="$enableval"],
+    [enable_profile=no]
+)
+
+if test "x$enable_profile" = xyes; then
+    DEFINES="$DEFINES -DPROFILE"
+    if test "x$GCC" = xyes; then
+        CFLAGS="$CFLAGS -fno-omit-frame-pointer"
+    fi
+    if test "x$GXX" = xyes; then
+        CXXFLAGS="$CXXFLAGS -fno-omit-frame-pointer"
+    fi
+fi
+
 if test "x$enable_debug" = xyes; then
     DEFINES="$DEFINES -DDEBUG"
+    if test "x$enable_profile" = xyes; then
+        AC_MSG_WARN([Debug and Profile are enabled at the same time])
+    fi
     if test "x$GCC" = xyes; then
         if ! echo "$CFLAGS" | grep -q -e '-g'; then
             CFLAGS="$CFLAGS -g"

From e378184d9c31a4b8f67cf1b75f401f2d5c54782a Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Fri, 8 Jan 2016 13:59:16 +0000
Subject: [PATCH 219/241] mesa/main: Avoid `void function returning a value`
 warning.

Trivial.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/shaderimage.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index c4ebf4201fb..040e9fd6e3c 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -738,8 +738,10 @@ _mesa_MemoryBarrierByRegion(GLbitfield barriers)
        * That is, if barriers is the special value GL_ALL_BARRIER_BITS, then all
        * barriers allowed by glMemoryBarrierByRegion should be activated."
        */
-      if (barriers == GL_ALL_BARRIER_BITS)
-         return ctx->Driver.MemoryBarrier(ctx, all_allowed_bits);
+      if (barriers == GL_ALL_BARRIER_BITS) {
+         ctx->Driver.MemoryBarrier(ctx, all_allowed_bits);
+         return;
+      }
 
       /* From section 7.11.2 of the OpenGL ES 3.1 specification:
        *

From 208bfc493debe0344d0b9cb93975981f14412628 Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Fri, 8 Jan 2016 14:03:38 +0000
Subject: [PATCH 220/241] glsl: Ensure 64bits shift is used.

I believe that `1u << x`, where x >= 32, yields undefined results
according to the C standard.

Particularly MSVC says `warning C4334: '<<' : result of 32-bit shift
implicitly converted to 64 bits (was 64-bit shift intended?)`.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/glsl/link_varyings.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 8763cc5b07d..3853abdb8e6 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -1110,8 +1110,8 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
        */
       for (unsigned j = 0; j < num_elements; j++) {
          while ((slot_end < MAX_VARYING * 4u) &&
-                ((reserved_slots & (1u << *location / 4u) ||
-                 (reserved_slots & (1u << slot_end / 4u))))) {
+                ((reserved_slots & (UINT64_C(1) << *location / 4u) ||
+                 (reserved_slots & (UINT64_C(1) << slot_end / 4u))))) {
 
             *location = ALIGN(*location + 1, 4);
             slot_end = *location;
@@ -1529,7 +1529,7 @@ reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode)
          ->count_attribute_slots(stage->Stage == MESA_SHADER_VERTEX);
       for (unsigned i = 0; i < num_elements; i++) {
          if (var_slot >= 0 && var_slot < MAX_VARYING)
-            slots |= 1u << var_slot;
+            slots |= UINT64_C(1) << var_slot;
          var_slot += 1;
       }
    }

From 8cc9a8aa2a97ca9e7a36a993954a3480d44c13d3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 20 Sep 2014 02:54:16 -0400
Subject: [PATCH 221/241] tgsi: add ureg support for image decls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_build.c       | 62 ++++++++-------
 src/gallium/auxiliary/tgsi/tgsi_dump.c        | 10 ++-
 src/gallium/auxiliary/tgsi/tgsi_parse.c       |  4 +-
 src/gallium/auxiliary/tgsi/tgsi_parse.h       |  2 +-
 src/gallium/auxiliary/tgsi/tgsi_strings.c     |  4 +-
 src/gallium/auxiliary/tgsi/tgsi_text.c        | 10 ++-
 src/gallium/auxiliary/tgsi/tgsi_ureg.c        | 77 +++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_ureg.h        |  7 ++
 src/gallium/drivers/ilo/shader/toy_tgsi.c     |  8 +-
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 12 ++-
 src/gallium/drivers/svga/svga_tgsi_vgpu10.c   |  2 +
 src/gallium/include/pipe/p_shader_tokens.h    |  7 +-
 12 files changed, 153 insertions(+), 52 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index fdb7febf7ea..bb9d0cbe25d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -259,36 +259,39 @@ tgsi_build_declaration_semantic(
    return ds;
 }
 
-static struct tgsi_declaration_resource
-tgsi_default_declaration_resource(void)
+static struct tgsi_declaration_image
+tgsi_default_declaration_image(void)
 {
-   struct tgsi_declaration_resource dr;
+   struct tgsi_declaration_image di;
 
-   dr.Resource = TGSI_TEXTURE_BUFFER;
-   dr.Raw = 0;
-   dr.Writable = 0;
-   dr.Padding = 0;
+   di.Resource = TGSI_TEXTURE_BUFFER;
+   di.Raw = 0;
+   di.Writable = 0;
+   di.Format = 0;
+   di.Padding = 0;
 
-   return dr;
+   return di;
 }
 
-static struct tgsi_declaration_resource
-tgsi_build_declaration_resource(unsigned texture,
-                                unsigned raw,
-                                unsigned writable,
-                                struct tgsi_declaration *declaration,
-                                struct tgsi_header *header)
+static struct tgsi_declaration_image
+tgsi_build_declaration_image(unsigned texture,
+                             unsigned format,
+                             unsigned raw,
+                             unsigned writable,
+                             struct tgsi_declaration *declaration,
+                             struct tgsi_header *header)
 {
-   struct tgsi_declaration_resource dr;
+   struct tgsi_declaration_image di;
 
-   dr = tgsi_default_declaration_resource();
-   dr.Resource = texture;
-   dr.Raw = raw;
-   dr.Writable = writable;
+   di = tgsi_default_declaration_image();
+   di.Resource = texture;
+   di.Format = format;
+   di.Raw = raw;
+   di.Writable = writable;
 
    declaration_grow(declaration, header);
 
-   return dr;
+   return di;
 }
 
 static struct tgsi_declaration_sampler_view
@@ -364,7 +367,7 @@ tgsi_default_full_declaration( void )
    full_declaration.Range = tgsi_default_declaration_range();
    full_declaration.Semantic = tgsi_default_declaration_semantic();
    full_declaration.Interp = tgsi_default_declaration_interp();
-   full_declaration.Resource = tgsi_default_declaration_resource();
+   full_declaration.Image = tgsi_default_declaration_image();
    full_declaration.SamplerView = tgsi_default_declaration_sampler_view();
    full_declaration.Array = tgsi_default_declaration_array();
 
@@ -454,20 +457,21 @@ tgsi_build_full_declaration(
          header );
    }
 
-   if (full_decl->Declaration.File == TGSI_FILE_RESOURCE) {
-      struct tgsi_declaration_resource *dr;
+   if (full_decl->Declaration.File == TGSI_FILE_IMAGE) {
+      struct tgsi_declaration_image *di;
 
       if (maxsize <= size) {
          return  0;
       }
-      dr = (struct tgsi_declaration_resource *)&tokens[size];
+      di = (struct tgsi_declaration_image *)&tokens[size];
       size++;
 
-      *dr = tgsi_build_declaration_resource(full_decl->Resource.Resource,
-                                            full_decl->Resource.Raw,
-                                            full_decl->Resource.Writable,
-                                            declaration,
-                                            header);
+      *di = tgsi_build_declaration_image(full_decl->Image.Resource,
+                                         full_decl->Image.Format,
+                                         full_decl->Image.Raw,
+                                         full_decl->Image.Writable,
+                                         declaration,
+                                         header);
    }
 
    if (full_decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index e29ffb39894..dad3839d897 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -348,12 +348,14 @@ iter_declaration(
       }
    }
 
-   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
+   if (decl->Declaration.File == TGSI_FILE_IMAGE) {
       TXT(", ");
-      ENM(decl->Resource.Resource, tgsi_texture_names);
-      if (decl->Resource.Writable)
+      ENM(decl->Image.Resource, tgsi_texture_names);
+      TXT(", ");
+      UID(decl->Image.Format);
+      if (decl->Image.Writable)
          TXT(", WR");
-      if (decl->Resource.Raw)
+      if (decl->Image.Raw)
          TXT(", RAW");
    }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 0729b5d2426..9a52bbbf5cb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -121,8 +121,8 @@ tgsi_parse_token(
          next_token( ctx, &decl->Semantic );
       }
 
-      if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
-         next_token(ctx, &decl->Resource);
+      if (decl->Declaration.File == TGSI_FILE_IMAGE) {
+         next_token(ctx, &decl->Image);
       }
 
       if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index 35e1c7cfd62..5ed1a83b027 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -64,7 +64,7 @@ struct tgsi_full_declaration
    struct tgsi_declaration_dimension Dim;
    struct tgsi_declaration_interp Interp;
    struct tgsi_declaration_semantic Semantic;
-   struct tgsi_declaration_resource Resource;
+   struct tgsi_declaration_image Image;
    struct tgsi_declaration_sampler_view SamplerView;
    struct tgsi_declaration_array Array;
 };
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index fd926b37c47..ae30399376f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -54,8 +54,8 @@ static const char *tgsi_file_names[] =
    "IMM",
    "PRED",
    "SV",
-   "RES",
-   "SVIEW"
+   "IMAGE",
+   "SVIEW",
 };
 
 const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 4a82c9b3552..a45ab908ee0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -1251,10 +1251,10 @@ static boolean parse_declaration( struct translate_ctx *ctx )
 
       cur++;
       eat_opt_white( &cur );
-      if (file == TGSI_FILE_RESOURCE) {
+      if (file == TGSI_FILE_IMAGE) {
          for (i = 0; i < TGSI_TEXTURE_COUNT; i++) {
             if (str_match_nocase_whole(&cur, tgsi_texture_names[i])) {
-               decl.Resource.Resource = i;
+               decl.Image.Resource = i;
                break;
             }
          }
@@ -1263,16 +1263,18 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             return FALSE;
          }
 
+         /* XXX format */
+
          cur2 = cur;
          eat_opt_white(&cur2);
          while (*cur2 == ',') {
             cur2++;
             eat_opt_white(&cur2);
             if (str_match_nocase_whole(&cur2, "RAW")) {
-               decl.Resource.Raw = 1;
+               decl.Image.Raw = 1;
 
             } else if (str_match_nocase_whole(&cur2, "WR")) {
-               decl.Resource.Writable = 1;
+               decl.Image.Writable = 1;
 
             } else {
                break;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 5b78542413e..59a4d3642a6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -50,6 +50,7 @@ union tgsi_any_token {
    struct tgsi_declaration_range decl_range;
    struct tgsi_declaration_dimension decl_dim;
    struct tgsi_declaration_interp decl_interp;
+   struct tgsi_declaration_image decl_image;
    struct tgsi_declaration_semantic decl_semantic;
    struct tgsi_declaration_sampler_view decl_sampler_view;
    struct tgsi_declaration_array array;
@@ -154,6 +155,15 @@ struct ureg_program
    } sampler_view[PIPE_MAX_SHADER_SAMPLER_VIEWS];
    unsigned nr_sampler_views;
 
+   struct {
+      unsigned index;
+      unsigned target;
+      unsigned format;
+      boolean wr;
+      boolean raw;
+   } image[PIPE_MAX_SHADER_IMAGES];
+   unsigned nr_images;
+
    struct util_bitmask *free_temps;
    struct util_bitmask *local_temps;
    struct util_bitmask *decl_temps;
@@ -656,6 +666,37 @@ ureg_DECL_sampler_view(struct ureg_program *ureg,
    return reg;
 }
 
+/* Allocate a new image.
+ */
+struct ureg_src
+ureg_DECL_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw)
+{
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_IMAGE, index);
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_images; i++)
+      if (ureg->image[i].index == index)
+         return reg;
+
+   if (i < PIPE_MAX_SHADER_IMAGES) {
+      ureg->image[i].index = index;
+      ureg->image[i].target = target;
+      ureg->image[i].wr = wr;
+      ureg->image[i].raw = raw;
+      ureg->image[i].format = format;
+      ureg->nr_images++;
+      return reg;
+   }
+
+   assert(0);
+   return reg;
+}
+
 static int
 match_or_expand_immediate64( const unsigned *v,
                              int type,
@@ -1485,6 +1526,33 @@ emit_decl_sampler_view(struct ureg_program *ureg,
    out[2].decl_sampler_view.ReturnTypeW = return_type_w;
 }
 
+static void
+emit_decl_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
+
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 3;
+   out[0].decl.File = TGSI_FILE_IMAGE;
+   out[0].decl.UsageMask = 0xf;
+
+   out[1].value = 0;
+   out[1].decl_range.First = index;
+   out[1].decl_range.Last = index;
+
+   out[2].value = 0;
+   out[2].decl_image.Resource = target;
+   out[2].decl_image.Writable = wr;
+   out[2].decl_image.Raw      = raw;
+   out[2].decl_image.Format   = format;
+}
+
 static void
 emit_immediate( struct ureg_program *ureg,
                 const unsigned *v,
@@ -1644,6 +1712,15 @@ static void emit_decls( struct ureg_program *ureg )
                              ureg->sampler_view[i].return_type_w);
    }
 
+   for (i = 0; i < ureg->nr_images; i++) {
+      emit_decl_image(ureg,
+                      ureg->image[i].index,
+                      ureg->image[i].target,
+                      ureg->image[i].format,
+                      ureg->image[i].wr,
+                      ureg->image[i].raw);
+   }
+
    if (ureg->const_decls.nr_constant_ranges) {
       for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
          emit_decl_range(ureg,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 5f15ebac517..39b0a0f0ba1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -326,6 +326,13 @@ ureg_DECL_sampler_view(struct ureg_program *,
                        unsigned return_type_z,
                        unsigned return_type_w );
 
+struct ureg_src
+ureg_DECL_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw);
 
 static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c
index d38585f1475..9a7140b9a9b 100644
--- a/src/gallium/drivers/ilo/shader/toy_tgsi.c
+++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c
@@ -1593,7 +1593,7 @@ ra_get_type(struct toy_tgsi *tgsi, const struct tgsi_full_instruction *tgsi_inst
       tgsi_inst->Src[operand].Register.File;
    switch (file) {
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       type = TOY_TYPE_D;
       break;
@@ -1834,7 +1834,7 @@ ra_get_src_indirect(struct toy_tgsi *tgsi,
       src = tsrc_null();
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       is_resource = true;
       /* fall through */
@@ -1918,7 +1918,7 @@ ra_get_src(struct toy_tgsi *tgsi,
       need_vrf = true;
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       assert(!s->Register.Dimension);
       src = tsrc_imm_d(s->Register.Index);
@@ -2256,7 +2256,7 @@ parse_declaration(struct toy_tgsi *tgsi,
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_PREDICATE:
    case TGSI_FILE_ADDRESS:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       /* nothing to do */
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 0d41c023db0..e3db975b26f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -352,7 +352,7 @@ static nv50_ir::DataFile translateFile(uint file)
    case TGSI_FILE_PREDICATE:       return nv50_ir::FILE_PREDICATE;
    case TGSI_FILE_IMMEDIATE:       return nv50_ir::FILE_IMMEDIATE;
    case TGSI_FILE_SYSTEM_VALUE:    return nv50_ir::FILE_SYSTEM_VALUE;
-   case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
+   //case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_NULL:
    default:
@@ -871,7 +871,7 @@ bool Source::scanSource()
    clipVertexOutput = -1;
 
    textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
-   resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
+   //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
 
    info->immd.bufSize = 0;
 
@@ -1159,6 +1159,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          }
       }
       break;
+/*
    case TGSI_FILE_RESOURCE:
       for (i = first; i <= last; ++i) {
          resources[i].target = decl->Resource.Resource;
@@ -1166,6 +1167,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          resources[i].slot = i;
       }
       break;
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       for (i = first; i <= last; ++i)
          textureViews[i].target = decl->SamplerView.Resource;
@@ -1231,11 +1233,13 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
          if (src.isIndirect(0))
             mainTempsInLMem = true;
       } else
+/*
       if (src.getFile() == TGSI_FILE_RESOURCE) {
          if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
             info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
                0x1 : 0x2;
       } else
+*/
       if (src.getFile() == TGSI_FILE_OUTPUT) {
          if (src.isIndirect(0)) {
             // We don't know which one is accessed, just mark everything for
@@ -1286,9 +1290,11 @@ Instruction::getTexture(const tgsi::Source *code, int s) const
    unsigned int r;
 
    switch (getSrc(s).getFile()) {
+/*
    case TGSI_FILE_RESOURCE:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->resources.at(r).target);
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->textureViews.at(r).target);
@@ -1696,7 +1702,7 @@ Converter::acquireDst(int d, int c)
    const int idx = dst.getIndex(0);
    const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
-   if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE)
+   if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
       return NULL;
 
    if (dst.isIndirect(0) ||
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index 098f6f5a28d..1223e446055 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -2298,11 +2298,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
       emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
       return TRUE;
 
+#if 0
    case TGSI_FILE_RESOURCE:
       /*opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE;*/
       /* XXX more, VGPU10_RETURN_TYPE_FLOAT */
       assert(!"TGSI_FILE_RESOURCE not handled yet");
       return FALSE;
+#endif
 
    case TGSI_FILE_ADDRESS:
       emit->num_address_regs = MAX2(emit->num_address_regs,
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index e8f4ad210e1..d18296276f7 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -76,7 +76,7 @@ enum tgsi_file_type {
    TGSI_FILE_IMMEDIATE           =7,
    TGSI_FILE_PREDICATE           =8,
    TGSI_FILE_SYSTEM_VALUE        =9,
-   TGSI_FILE_RESOURCE            =10,
+   TGSI_FILE_IMAGE               =10,
    TGSI_FILE_SAMPLER_VIEW        =11,
    TGSI_FILE_COUNT      /**< how many TGSI_FILE_ types */
 };
@@ -197,11 +197,12 @@ struct tgsi_declaration_semantic
    unsigned Padding        : 8;
 };
 
-struct tgsi_declaration_resource {
+struct tgsi_declaration_image {
    unsigned Resource    : 8; /**< one of TGSI_TEXTURE_ */
    unsigned Raw         : 1;
    unsigned Writable    : 1;
-   unsigned Padding     : 22;
+   unsigned Format      : 10; /**< one of PIPE_FORMAT_ */
+   unsigned Padding     : 12;
 };
 
 enum tgsi_return_type {

From 888ddd632d7f6af635cc948f1b3e8982b43800d2 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 19 Sep 2015 18:19:13 -0400
Subject: [PATCH 222/241] ureg: add buffer support to ureg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_dump.c     |  5 +++
 src/gallium/auxiliary/tgsi/tgsi_strings.c  |  1 +
 src/gallium/auxiliary/tgsi/tgsi_text.c     |  5 +++
 src/gallium/auxiliary/tgsi/tgsi_ureg.c     | 52 ++++++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_ureg.h     |  3 ++
 src/gallium/include/pipe/p_shader_tokens.h |  4 +-
 6 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index dad3839d897..de3aae5337c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -359,6 +359,11 @@ iter_declaration(
          TXT(", RAW");
    }
 
+   if (decl->Declaration.File == TGSI_FILE_BUFFER) {
+      if (decl->Declaration.Atomic)
+         TXT(", ATOMIC");
+   }
+
    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
       TXT(", ");
       ENM(decl->SamplerView.Resource, tgsi_texture_names);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index ae30399376f..c0dd04497f7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -56,6 +56,7 @@ static const char *tgsi_file_names[] =
    "SV",
    "IMAGE",
    "SVIEW",
+   "BUFFER",
 };
 
 const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index a45ab908ee0..d72d843951a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -1350,6 +1350,11 @@ static boolean parse_declaration( struct translate_ctx *ctx )
                decl.SamplerView.ReturnTypeX;
          }
          ctx->cur = cur;
+      } else if (file == TGSI_FILE_BUFFER) {
+         if (str_match_nocase_whole(&cur, "ATOMIC")) {
+            decl.Declaration.Atomic = 1;
+            ctx->cur = cur;
+         }
       } else {
          if (str_match_nocase_whole(&cur, "LOCAL")) {
             decl.Declaration.Local = 1;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 59a4d3642a6..0ad23dd5e56 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -164,6 +164,12 @@ struct ureg_program
    } image[PIPE_MAX_SHADER_IMAGES];
    unsigned nr_images;
 
+   struct {
+      unsigned index;
+      bool atomic;
+   } buffer[PIPE_MAX_SHADER_BUFFERS];
+   unsigned nr_buffers;
+
    struct util_bitmask *free_temps;
    struct util_bitmask *local_temps;
    struct util_bitmask *decl_temps;
@@ -697,6 +703,29 @@ ureg_DECL_image(struct ureg_program *ureg,
    return reg;
 }
 
+/* Declare buffer 'nr' and return a TGSI_FILE_BUFFER source register for it;
+ * if 'nr' was declared before, its original 'atomic' flag is kept. */
+struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr,
+                                 bool atomic)
+{
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_BUFFER, nr);
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_buffers; i++)
+      if (ureg->buffer[i].index == nr)
+         return reg;   /* already declared: 'atomic' is not updated */
+
+   if (i < PIPE_MAX_SHADER_BUFFERS) {   /* here i == ureg->nr_buffers */
+      ureg->buffer[i].index = nr;
+      ureg->buffer[i].atomic = atomic;
+      ureg->nr_buffers++;
+      return reg;
+   }
+
+   assert(0);   /* every one of the PIPE_MAX_SHADER_BUFFERS slots is in use */
+   return reg;   /* reached only if the assert is compiled out; 'nr' is not recorded */
+}
+
 static int
 match_or_expand_immediate64( const unsigned *v,
                              int type,
@@ -1553,6 +1582,25 @@ emit_decl_image(struct ureg_program *ureg,
    out[2].decl_image.Format   = format;
 }
 
+static void
+emit_decl_buffer(struct ureg_program *ureg,
+                 unsigned index,
+                 bool atomic)
+{  /* Emit the two-token declaration for the single buffer register 'index'. */
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);
+
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 2;   /* out[0] declaration + out[1] range */
+   out[0].decl.File = TGSI_FILE_BUFFER;
+   out[0].decl.UsageMask = 0xf;
+   out[0].decl.Atomic = atomic;
+
+   out[1].value = 0;
+   out[1].decl_range.First = index;
+   out[1].decl_range.Last = index;   /* First == Last: the range is one register */
+}
+
 static void
 emit_immediate( struct ureg_program *ureg,
                 const unsigned *v,
@@ -1721,6 +1769,10 @@ static void emit_decls( struct ureg_program *ureg )
                       ureg->image[i].raw);
    }
 
+   for (i = 0; i < ureg->nr_buffers; i++) {
+      emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic);
+   }
+
    if (ureg->const_decls.nr_constant_ranges) {
       for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
          emit_decl_range(ureg,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 39b0a0f0ba1..4a411c66491 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -334,6 +334,9 @@ ureg_DECL_image(struct ureg_program *ureg,
                 boolean wr,
                 boolean raw);
 
+struct ureg_src
+ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic);
+
 static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
                        float a, float b,
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index d18296276f7..815aff1dab3 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -78,6 +78,7 @@ enum tgsi_file_type {
    TGSI_FILE_SYSTEM_VALUE        =9,
    TGSI_FILE_IMAGE               =10,
    TGSI_FILE_SAMPLER_VIEW        =11,
+   TGSI_FILE_BUFFER              =12,
    TGSI_FILE_COUNT      /**< how many TGSI_FILE_ types */
 };
 
@@ -127,7 +128,8 @@ struct tgsi_declaration
    unsigned Invariant   : 1;  /**< invariant optimization? */
    unsigned Local       : 1;  /**< optimize as subroutine local variable? */
    unsigned Array       : 1;  /**< extra array info? */
-   unsigned Padding     : 6;
+   unsigned Atomic      : 1;  /**< atomic only? for TGSI_FILE_BUFFER */
+   unsigned Padding     : 5;
 };
 
 struct tgsi_declaration_range

From 50b8488926c4fa45ed79148217b81e54252788e7 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 7 Nov 2015 02:25:20 -0500
Subject: [PATCH 223/241] tgsi: provide a way to encode memory qualifiers for
 SSBO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each load/store on most hardware can specify what caching to do. Since
SSBO allows individual variables to also have separate caching modes,
allow loads/stores to have the qualifiers instead of attempting to
encode them in declarations.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_build.c    | 50 ++++++++++++++++++++-
 src/gallium/auxiliary/tgsi/tgsi_dump.c     | 10 +++++
 src/gallium/auxiliary/tgsi/tgsi_parse.c    |  4 ++
 src/gallium/auxiliary/tgsi/tgsi_parse.h    |  1 +
 src/gallium/auxiliary/tgsi/tgsi_strings.c  |  7 +++
 src/gallium/auxiliary/tgsi/tgsi_strings.h  |  2 +
 src/gallium/auxiliary/tgsi/tgsi_text.c     | 27 +++++++++++
 src/gallium/auxiliary/tgsi/tgsi_ureg.c     | 52 ++++++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_ureg.h     | 13 ++++++
 src/gallium/include/pipe/p_shader_tokens.h | 16 ++++++-
 10 files changed, 180 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index bb9d0cbe25d..ea207461d27 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -620,7 +620,8 @@ tgsi_default_instruction( void )
    instruction.NumSrcRegs = 1;
    instruction.Label = 0;
    instruction.Texture = 0;
-   instruction.Padding  = 0;
+   instruction.Memory = 0;
+   instruction.Padding = 0;
 
    return instruction;
 }
@@ -766,6 +767,34 @@ tgsi_build_instruction_texture(
    return instruction_texture;
 }
 
+static struct tgsi_instruction_memory
+tgsi_default_instruction_memory( void )
+{  /* Return a memory token with every field zeroed. */
+   struct tgsi_instruction_memory instruction_memory;
+
+   instruction_memory.Qualifier = 0;   /* no TGSI_MEMORY_* bit set */
+   instruction_memory.Padding = 0;
+
+   return instruction_memory;
+}
+
+static struct tgsi_instruction_memory
+tgsi_build_instruction_memory(
+   unsigned qualifier,   /* TGSI_MEMORY_* bitmask; stored in a 3-bit field */
+   struct tgsi_token *prev_token,   /* not referenced in this function */
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{  /* Build the memory token and flag 'instruction' as carrying one. */
+   struct tgsi_instruction_memory instruction_memory;
+
+   instruction_memory.Qualifier = qualifier;
+   instruction_memory.Padding = 0;
+   instruction->Memory = 1;
+
+   instruction_grow( instruction, header );   /* NOTE(review): presumably counts the extra token - confirm */
+
+   return instruction_memory;
+}
 
 static struct tgsi_texture_offset
 tgsi_default_texture_offset( void )
@@ -1012,6 +1041,7 @@ tgsi_default_full_instruction( void )
    full_instruction.Predicate = tgsi_default_instruction_predicate();
    full_instruction.Label = tgsi_default_instruction_label();
    full_instruction.Texture = tgsi_default_instruction_texture();
+   full_instruction.Memory = tgsi_default_instruction_memory();
    for( i = 0;  i < TGSI_FULL_MAX_TEX_OFFSETS; i++ ) {
       full_instruction.TexOffsets[i] = tgsi_default_texture_offset();
    }
@@ -1123,6 +1153,24 @@ tgsi_build_full_instruction(
          prev_token = (struct tgsi_token *) texture_offset;
       }
    }
+
+   if (full_inst->Instruction.Memory) {
+      struct tgsi_instruction_memory *instruction_memory;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_memory =
+         (struct  tgsi_instruction_memory *) &tokens[size];
+      size++;
+
+      *instruction_memory = tgsi_build_instruction_memory(
+         full_inst->Memory.Qualifier,
+         prev_token,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) instruction_memory;
+   }
+
    for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
       const struct tgsi_full_dst_register *reg = &full_inst->Dst[i];
       struct tgsi_dst_register *dst_register;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index de3aae5337c..2ad29b9d49a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -624,6 +624,16 @@ iter_instruction(
       }
    }
 
+   if (inst->Instruction.Memory) {
+      uint32_t qualifier = inst->Memory.Qualifier;
+      while (qualifier) {
+         int bit = ffs(qualifier) - 1;
+         qualifier &= ~(1U << bit);
+         TXT(", ");
+         ENM(bit, tgsi_memory_names);
+      }
+   }
+
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_IF:
    case TGSI_OPCODE_UIF:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 9a52bbbf5cb..ae95ebd82a4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -195,6 +195,10 @@ tgsi_parse_token(
          }
       }
 
+      if (inst->Instruction.Memory) {
+         next_token(ctx, &inst->Memory);
+      }
+
       assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
 
       for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index 5ed1a83b027..4689fb797d0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -91,6 +91,7 @@ struct tgsi_full_instruction
    struct tgsi_instruction_predicate   Predicate;
    struct tgsi_instruction_label       Label;
    struct tgsi_instruction_texture     Texture;
+   struct tgsi_instruction_memory      Memory;
    struct tgsi_full_dst_register       Dst[TGSI_FULL_MAX_DST_REGISTERS];
    struct tgsi_full_src_register       Src[TGSI_FULL_MAX_SRC_REGISTERS];
    struct tgsi_texture_offset          TexOffsets[TGSI_FULL_MAX_TEX_OFFSETS];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index c0dd04497f7..f2d70d49839 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -208,6 +208,13 @@ const char *tgsi_immediate_type_names[4] =
    "FLT64"
 };
 
+const char *tgsi_memory_names[3] =   /* indexed by TGSI_MEMORY_* bit number */
+{
+   "COHERENT",   /* bit 0: TGSI_MEMORY_COHERENT */
+   "RESTRICT",   /* bit 1: TGSI_MEMORY_RESTRICT */
+   "VOLATILE",   /* bit 2: TGSI_MEMORY_VOLATILE */
+};
+
 
 static inline void
 tgsi_strings_check(void)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.h b/src/gallium/auxiliary/tgsi/tgsi_strings.h
index 71e74372f22..031d32278cc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.h
@@ -60,6 +60,8 @@ extern const char *tgsi_fs_coord_pixel_center_names[2];
 
 extern const char *tgsi_immediate_type_names[4];
 
+extern const char *tgsi_memory_names[3];
+
 
 const char *
 tgsi_file_name(unsigned file);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index d72d843951a..97b1869a66f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -1039,6 +1039,12 @@ parse_instruction(
       inst.Texture.Texture = TGSI_TEXTURE_UNKNOWN;
    }
 
+   if ((i >= TGSI_OPCODE_LOAD && i <= TGSI_OPCODE_ATOMIMAX) ||
+       i == TGSI_OPCODE_RESQ) {
+      inst.Instruction.Memory = 1;
+      inst.Memory.Qualifier = 0;
+   }
+
    /* Parse instruction operands.
     */
    for (i = 0; i < info->num_dst + info->num_src + info->is_tex; i++) {
@@ -1090,6 +1096,27 @@ parse_instruction(
    }
    inst.Texture.NumOffsets = i;
 
+   cur = ctx->cur;
+   eat_opt_white(&cur);
+   for (i = 0; inst.Instruction.Memory && *cur == ','; i++) {
+      uint j;
+      cur++;
+      eat_opt_white(&cur);
+      ctx->cur = cur;
+      for (j = 0; j < 3; j++) {
+         if (str_match_nocase_whole(&ctx->cur, tgsi_memory_names[j])) {
+            inst.Memory.Qualifier |= 1U << j;
+            break;
+         }
+      }
+      if (j == 3) {
+         report_error(ctx, "Expected memory qualifier");
+         return FALSE;
+      }
+      cur = ctx->cur;
+      eat_opt_white(&cur);
+   }
+
    cur = ctx->cur;
    eat_opt_white( &cur );
    if (info->is_branch && *cur == ':') {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 0ad23dd5e56..d6811501d16 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -60,6 +60,7 @@ union tgsi_any_token {
    struct tgsi_instruction_predicate insn_predicate;
    struct tgsi_instruction_label insn_label;
    struct tgsi_instruction_texture insn_texture;
+   struct tgsi_instruction_memory insn_memory;
    struct tgsi_texture_offset insn_texture_offset;
    struct tgsi_src_register src;
    struct tgsi_ind_register ind;
@@ -1226,6 +1227,21 @@ ureg_emit_texture_offset(struct ureg_program *ureg,
    
 }
 
+void
+ureg_emit_memory(struct ureg_program *ureg,
+                 unsigned extended_token,
+                 unsigned qualifier)   /* TGSI_MEMORY_* bitmask */
+{  /* Append a memory token and set the Memory flag on the token at 'extended_token'. */
+   union tgsi_any_token *out, *insn;
+
+   out = get_tokens( ureg, DOMAIN_INSN, 1 );
+   insn = retrieve_token( ureg, DOMAIN_INSN, extended_token );   /* NOTE(review): looked up after get_tokens(), presumably because that call may move storage - confirm */
+
+   insn->insn.Memory = 1;
+
+   out[0].value = 0;
+   out[0].insn_memory.Qualifier = qualifier;
+}
 
 void
 ureg_fixup_insn_size(struct ureg_program *ureg,
@@ -1378,6 +1394,42 @@ ureg_label_insn(struct ureg_program *ureg,
 }
 
 
+void
+ureg_memory_insn(struct ureg_program *ureg,
+                 unsigned opcode,
+                 const struct ureg_dst *dst,
+                 unsigned nr_dst,
+                 const struct ureg_src *src,
+                 unsigned nr_src,
+                 unsigned qualifier)   /* TGSI_MEMORY_* bitmask */
+{  /* Emit 'opcode' with a memory token placed ahead of its dst/src operands. */
+   struct ureg_emit_insn_result insn;
+   unsigned i;
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         FALSE,
+                         FALSE,
+                         FALSE,
+                         TGSI_SWIZZLE_X,
+                         TGSI_SWIZZLE_Y,
+                         TGSI_SWIZZLE_Z,
+                         TGSI_SWIZZLE_W,
+                         nr_dst,
+                         nr_src);
+
+   ureg_emit_memory(ureg, insn.extended_token, qualifier);   /* takes the extended token, not insn_token */
+
+   for (i = 0; i < nr_dst; i++)
+      ureg_emit_dst(ureg, dst[i]);
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src(ureg, src[i]);
+
+   ureg_fixup_insn_size(ureg, insn.insn_token);   /* called last, after all tokens are emitted */
+}
+
+
 static void
 emit_decl_semantic(struct ureg_program *ureg,
                    unsigned file,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 4a411c66491..86e58a91343 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -531,6 +531,14 @@ ureg_label_insn(struct ureg_program *ureg,
                 unsigned nr_src,
                 unsigned *label);
 
+void
+ureg_memory_insn(struct ureg_program *ureg,
+                 unsigned opcode,
+                 const struct ureg_dst *dst,
+                 unsigned nr_dst,
+                 const struct ureg_src *src,
+                 unsigned nr_src,
+                 unsigned qualifier);
 
 /***********************************************************************
  * Internal instruction helpers, don't call these directly:
@@ -568,6 +576,11 @@ void
 ureg_emit_texture_offset(struct ureg_program *ureg,
                          const struct tgsi_texture_offset *offset);
 
+void
+ureg_emit_memory(struct ureg_program *ureg,
+                 unsigned extended_token,   /* ureg_emit_insn_result.extended_token */
+                 unsigned qualifier);       /* TGSI_MEMORY_* bitmask */
+
 void 
 ureg_emit_dst( struct ureg_program *ureg,
                struct ureg_dst dst );
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 815aff1dab3..43a5561882c 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -572,7 +572,8 @@ struct tgsi_instruction
    unsigned Predicate  : 1;  /* BOOL */
    unsigned Label      : 1;
    unsigned Texture    : 1;
-   unsigned Padding    : 2;
+   unsigned Memory     : 1;
+   unsigned Padding    : 1;
 };
 
 /*
@@ -729,6 +730,19 @@ struct tgsi_dst_register
    unsigned Padding     : 6;
 };
 
+#define TGSI_MEMORY_COHERENT (1 << 0)
+#define TGSI_MEMORY_RESTRICT (1 << 1)
+#define TGSI_MEMORY_VOLATILE (1 << 2)
+
+/**
+ * Extra instruction token giving the memory qualifiers of a LOAD/STORE-style access.
+ */
+struct tgsi_instruction_memory
+{
+   unsigned Qualifier : 3;  /* bitmask of the TGSI_MEMORY_* flags */
+   unsigned Padding   : 29; /* fills the token out to 32 bits */
+};
+
 
 #ifdef __cplusplus
 }

From bdef02ff26a2626ee5882f67986ee0806c15a552 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 26 Sep 2015 17:35:41 -0400
Subject: [PATCH 224/241] tgsi: add an is_store property
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_info.c | 446 ++++++++++++-------------
 src/gallium/auxiliary/tgsi/tgsi_info.h |   1 +
 2 files changed, 224 insertions(+), 223 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index c078b6f94ee..b94aa63cac5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -37,231 +37,231 @@
 
 static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
 {
-   { 1, 1, 0, 0, 0, 0, COMP, "ARL", TGSI_OPCODE_ARL },
-   { 1, 1, 0, 0, 0, 0, COMP, "MOV", TGSI_OPCODE_MOV },
-   { 1, 1, 0, 0, 0, 0, CHAN, "LIT", TGSI_OPCODE_LIT },
-   { 1, 1, 0, 0, 0, 0, REPL, "RCP", TGSI_OPCODE_RCP },
-   { 1, 1, 0, 0, 0, 0, REPL, "RSQ", TGSI_OPCODE_RSQ },
-   { 1, 1, 0, 0, 0, 0, CHAN, "EXP", TGSI_OPCODE_EXP },
-   { 1, 1, 0, 0, 0, 0, CHAN, "LOG", TGSI_OPCODE_LOG },
-   { 1, 2, 0, 0, 0, 0, COMP, "MUL", TGSI_OPCODE_MUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "ADD", TGSI_OPCODE_ADD },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP3", TGSI_OPCODE_DP3 },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP4", TGSI_OPCODE_DP4 },
-   { 1, 2, 0, 0, 0, 0, CHAN, "DST", TGSI_OPCODE_DST },
-   { 1, 2, 0, 0, 0, 0, COMP, "MIN", TGSI_OPCODE_MIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "MAX", TGSI_OPCODE_MAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "SLT", TGSI_OPCODE_SLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "SGE", TGSI_OPCODE_SGE },
-   { 1, 3, 0, 0, 0, 0, COMP, "MAD", TGSI_OPCODE_MAD },
-   { 1, 2, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB },
-   { 1, 3, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP },
-   { 1, 3, 0, 0, 0, 0, COMP, "FMA", TGSI_OPCODE_FMA },
-   { 1, 1, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT },
-   { 1, 3, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 22 },      /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "", 23 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "FRC", TGSI_OPCODE_FRC },
-   { 1, 3, 0, 0, 0, 0, COMP, "CLAMP", TGSI_OPCODE_CLAMP },
-   { 1, 1, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
-   { 1, 1, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
-   { 1, 1, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
-   { 1, 1, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
-   { 1, 2, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
-   { 1, 2, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 32 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "ABS", TGSI_OPCODE_ABS },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 34 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
-   { 1, 1, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
-   { 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
-   { 1, 1, 0, 0, 0, 0, REPL, "PK2H", TGSI_OPCODE_PK2H },
-   { 1, 1, 0, 0, 0, 0, REPL, "PK2US", TGSI_OPCODE_PK2US },
-   { 1, 1, 0, 0, 0, 0, REPL, "PK4B", TGSI_OPCODE_PK4B },
-   { 1, 1, 0, 0, 0, 0, REPL, "PK4UB", TGSI_OPCODE_PK4UB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 44 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 46 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "SGT", TGSI_OPCODE_SGT },
-   { 1, 1, 0, 0, 0, 0, REPL, "SIN", TGSI_OPCODE_SIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "SLE", TGSI_OPCODE_SLE },
-   { 1, 2, 0, 0, 0, 0, COMP, "SNE", TGSI_OPCODE_SNE },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 51 },      /* removed */
-   { 1, 2, 1, 0, 0, 0, OTHR, "TEX", TGSI_OPCODE_TEX },
-   { 1, 4, 1, 0, 0, 0, OTHR, "TXD", TGSI_OPCODE_TXD },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXP", TGSI_OPCODE_TXP },
-   { 1, 1, 0, 0, 0, 0, CHAN, "UP2H", TGSI_OPCODE_UP2H },
-   { 1, 1, 0, 0, 0, 0, CHAN, "UP2US", TGSI_OPCODE_UP2US },
-   { 1, 1, 0, 0, 0, 0, CHAN, "UP4B", TGSI_OPCODE_UP4B },
-   { 1, 1, 0, 0, 0, 0, CHAN, "UP4UB", TGSI_OPCODE_UP4UB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 59 },      /* removed */
-   { 0, 1, 0, 0, 0, 1, NONE, "", 60 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "ARR", TGSI_OPCODE_ARR },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 62 },      /* removed */
-   { 0, 0, 0, 1, 0, 0, NONE, "CAL", TGSI_OPCODE_CAL },
-   { 0, 0, 0, 0, 0, 0, NONE, "RET", TGSI_OPCODE_RET },
-   { 1, 1, 0, 0, 0, 0, COMP, "SSG", TGSI_OPCODE_SSG },
-   { 1, 3, 0, 0, 0, 0, COMP, "CMP", TGSI_OPCODE_CMP },
-   { 1, 1, 0, 0, 0, 0, CHAN, "SCS", TGSI_OPCODE_SCS },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXB", TGSI_OPCODE_TXB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 69 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "DIV", TGSI_OPCODE_DIV },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP2", TGSI_OPCODE_DP2 },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXL", TGSI_OPCODE_TXL },
-   { 0, 0, 0, 0, 0, 0, NONE, "BRK", TGSI_OPCODE_BRK },
-   { 0, 1, 0, 1, 0, 1, NONE, "IF", TGSI_OPCODE_IF },
-   { 0, 1, 0, 1, 0, 1, NONE, "UIF", TGSI_OPCODE_UIF },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 76 },      /* removed */
-   { 0, 0, 0, 1, 1, 1, NONE, "ELSE", TGSI_OPCODE_ELSE },
-   { 0, 0, 0, 0, 1, 0, NONE, "ENDIF", TGSI_OPCODE_ENDIF },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDX_FINE", TGSI_OPCODE_DDX_FINE },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDY_FINE", TGSI_OPCODE_DDY_FINE },
-   { 0, 1, 0, 0, 0, 0, NONE, "PUSHA", TGSI_OPCODE_PUSHA },
-   { 1, 0, 0, 0, 0, 0, NONE, "POPA", TGSI_OPCODE_POPA },
-   { 1, 1, 0, 0, 0, 0, COMP, "CEIL", TGSI_OPCODE_CEIL },
-   { 1, 1, 0, 0, 0, 0, COMP, "I2F", TGSI_OPCODE_I2F },
-   { 1, 1, 0, 0, 0, 0, COMP, "NOT", TGSI_OPCODE_NOT },
-   { 1, 1, 0, 0, 0, 0, COMP, "TRUNC", TGSI_OPCODE_TRUNC },
-   { 1, 2, 0, 0, 0, 0, COMP, "SHL", TGSI_OPCODE_SHL },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 88 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "AND", TGSI_OPCODE_AND },
-   { 1, 2, 0, 0, 0, 0, COMP, "OR", TGSI_OPCODE_OR },
-   { 1, 2, 0, 0, 0, 0, COMP, "MOD", TGSI_OPCODE_MOD },
-   { 1, 2, 0, 0, 0, 0, COMP, "XOR", TGSI_OPCODE_XOR },
-   { 1, 3, 0, 0, 0, 0, COMP, "SAD", TGSI_OPCODE_SAD },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXF", TGSI_OPCODE_TXF },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXQ", TGSI_OPCODE_TXQ },
-   { 0, 0, 0, 0, 0, 0, NONE, "CONT", TGSI_OPCODE_CONT },
-   { 0, 1, 0, 0, 0, 0, NONE, "EMIT", TGSI_OPCODE_EMIT },
-   { 0, 1, 0, 0, 0, 0, NONE, "ENDPRIM", TGSI_OPCODE_ENDPRIM },
-   { 0, 0, 0, 1, 0, 1, NONE, "BGNLOOP", TGSI_OPCODE_BGNLOOP },
-   { 0, 0, 0, 0, 0, 1, NONE, "BGNSUB", TGSI_OPCODE_BGNSUB },
-   { 0, 0, 0, 1, 1, 0, NONE, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
-   { 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB },
-   { 1, 1, 1, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ },
-   { 1, 1, 1, 0, 0, 0, OTHR, "TXQS", TGSI_OPCODE_TXQS },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 105 },     /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "", 106 },     /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 112 },      /* removed */
-   { 0, 1, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
-   { 0, 1, 0, 0, 0, 0, NONE, "", 114 },     /* removed */
-   { 0, 1, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
-   { 0, 1, 0, 0, 0, 0, NONE, "KILL_IF", TGSI_OPCODE_KILL_IF },
-   { 0, 0, 0, 0, 0, 0, NONE, "END", TGSI_OPCODE_END },
-   { 1, 3, 0, 0, 0, 0, COMP, "DFMA", TGSI_OPCODE_DFMA },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2I", TGSI_OPCODE_F2I },
-   { 1, 2, 0, 0, 0, 0, COMP, "IDIV", TGSI_OPCODE_IDIV },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMAX", TGSI_OPCODE_IMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMIN", TGSI_OPCODE_IMIN },
-   { 1, 1, 0, 0, 0, 0, COMP, "INEG", TGSI_OPCODE_INEG },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISGE", TGSI_OPCODE_ISGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISHR", TGSI_OPCODE_ISHR },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISLT", TGSI_OPCODE_ISLT },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2U", TGSI_OPCODE_F2U },
-   { 1, 1, 0, 0, 0, 0, COMP, "U2F", TGSI_OPCODE_U2F },
-   { 1, 2, 0, 0, 0, 0, COMP, "UADD", TGSI_OPCODE_UADD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UDIV", TGSI_OPCODE_UDIV },
-   { 1, 3, 0, 0, 0, 0, COMP, "UMAD", TGSI_OPCODE_UMAD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMAX", TGSI_OPCODE_UMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMIN", TGSI_OPCODE_UMIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMOD", TGSI_OPCODE_UMOD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMUL", TGSI_OPCODE_UMUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "USEQ", TGSI_OPCODE_USEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "USGE", TGSI_OPCODE_USGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "USHR", TGSI_OPCODE_USHR },
-   { 1, 2, 0, 0, 0, 0, COMP, "USLT", TGSI_OPCODE_USLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "USNE", TGSI_OPCODE_USNE },
-   { 0, 1, 0, 0, 0, 0, NONE, "SWITCH", TGSI_OPCODE_SWITCH },
-   { 0, 1, 0, 0, 0, 0, NONE, "CASE", TGSI_OPCODE_CASE },
-   { 0, 0, 0, 0, 0, 0, NONE, "DEFAULT", TGSI_OPCODE_DEFAULT },
-   { 0, 0, 0, 0, 0, 0, NONE, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ARL", TGSI_OPCODE_ARL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "MOV", TGSI_OPCODE_MOV },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "LIT", TGSI_OPCODE_LIT },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "RCP", TGSI_OPCODE_RCP },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "RSQ", TGSI_OPCODE_RSQ },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "EXP", TGSI_OPCODE_EXP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "LOG", TGSI_OPCODE_LOG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MUL", TGSI_OPCODE_MUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ADD", TGSI_OPCODE_ADD },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP3", TGSI_OPCODE_DP3 },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP4", TGSI_OPCODE_DP4 },
+   { 1, 2, 0, 0, 0, 0, 0, CHAN, "DST", TGSI_OPCODE_DST },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MIN", TGSI_OPCODE_MIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MAX", TGSI_OPCODE_MAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SLT", TGSI_OPCODE_SLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SGE", TGSI_OPCODE_SGE },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "MAD", TGSI_OPCODE_MAD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "FMA", TGSI_OPCODE_FMA },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT },
+   { 1, 3, 0, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 22 },      /* removed */
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 23 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "FRC", TGSI_OPCODE_FRC },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "CLAMP", TGSI_OPCODE_CLAMP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 32 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ABS", TGSI_OPCODE_ABS },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 34 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2H", TGSI_OPCODE_PK2H },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2US", TGSI_OPCODE_PK2US },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4B", TGSI_OPCODE_PK4B },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4UB", TGSI_OPCODE_PK4UB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 44 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 46 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SGT", TGSI_OPCODE_SGT },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "SIN", TGSI_OPCODE_SIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SLE", TGSI_OPCODE_SLE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SNE", TGSI_OPCODE_SNE },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 51 },      /* removed */
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TEX", TGSI_OPCODE_TEX },
+   { 1, 4, 1, 0, 0, 0, 0, OTHR, "TXD", TGSI_OPCODE_TXD },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXP", TGSI_OPCODE_TXP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP2H", TGSI_OPCODE_UP2H },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP2US", TGSI_OPCODE_UP2US },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP4B", TGSI_OPCODE_UP4B },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP4UB", TGSI_OPCODE_UP4UB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 59 },      /* removed */
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 60 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ARR", TGSI_OPCODE_ARR },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 62 },      /* removed */
+   { 0, 0, 0, 0, 1, 0, 0, NONE, "CAL", TGSI_OPCODE_CAL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "RET", TGSI_OPCODE_RET },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "SSG", TGSI_OPCODE_SSG },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "CMP", TGSI_OPCODE_CMP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "SCS", TGSI_OPCODE_SCS },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXB", TGSI_OPCODE_TXB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 69 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DIV", TGSI_OPCODE_DIV },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP2", TGSI_OPCODE_DP2 },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXL", TGSI_OPCODE_TXL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "BRK", TGSI_OPCODE_BRK },
+   { 0, 1, 0, 0, 1, 0, 1, NONE, "IF", TGSI_OPCODE_IF },
+   { 0, 1, 0, 0, 1, 0, 1, NONE, "UIF", TGSI_OPCODE_UIF },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 76 },      /* removed */
+   { 0, 0, 0, 0, 1, 1, 1, NONE, "ELSE", TGSI_OPCODE_ELSE },
+   { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDIF", TGSI_OPCODE_ENDIF },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDX_FINE", TGSI_OPCODE_DDX_FINE },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDY_FINE", TGSI_OPCODE_DDY_FINE },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "PUSHA", TGSI_OPCODE_PUSHA },
+   { 1, 0, 0, 0, 0, 0, 0, NONE, "POPA", TGSI_OPCODE_POPA },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "CEIL", TGSI_OPCODE_CEIL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "I2F", TGSI_OPCODE_I2F },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "NOT", TGSI_OPCODE_NOT },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "TRUNC", TGSI_OPCODE_TRUNC },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SHL", TGSI_OPCODE_SHL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 88 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "AND", TGSI_OPCODE_AND },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "OR", TGSI_OPCODE_OR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MOD", TGSI_OPCODE_MOD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "XOR", TGSI_OPCODE_XOR },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "SAD", TGSI_OPCODE_SAD },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXF", TGSI_OPCODE_TXF },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXQ", TGSI_OPCODE_TXQ },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "CONT", TGSI_OPCODE_CONT },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "EMIT", TGSI_OPCODE_EMIT },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "ENDPRIM", TGSI_OPCODE_ENDPRIM },
+   { 0, 0, 0, 0, 1, 0, 1, NONE, "BGNLOOP", TGSI_OPCODE_BGNLOOP },
+   { 0, 0, 0, 0, 0, 0, 1, NONE, "BGNSUB", TGSI_OPCODE_BGNSUB },
+   { 0, 0, 0, 0, 1, 1, 0, NONE, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
+   { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB },
+   { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ },
+   { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQS", TGSI_OPCODE_TXQS },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 105 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 106 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 112 },      /* removed */
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "", 114 },     /* removed */
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "KILL_IF", TGSI_OPCODE_KILL_IF },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "END", TGSI_OPCODE_END },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "DFMA", TGSI_OPCODE_DFMA },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2I", TGSI_OPCODE_F2I },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IDIV", TGSI_OPCODE_IDIV },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMAX", TGSI_OPCODE_IMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMIN", TGSI_OPCODE_IMIN },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "INEG", TGSI_OPCODE_INEG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISGE", TGSI_OPCODE_ISGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISHR", TGSI_OPCODE_ISHR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISLT", TGSI_OPCODE_ISLT },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2U", TGSI_OPCODE_F2U },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "U2F", TGSI_OPCODE_U2F },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UADD", TGSI_OPCODE_UADD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UDIV", TGSI_OPCODE_UDIV },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UMAD", TGSI_OPCODE_UMAD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMAX", TGSI_OPCODE_UMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMIN", TGSI_OPCODE_UMIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMOD", TGSI_OPCODE_UMOD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMUL", TGSI_OPCODE_UMUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USEQ", TGSI_OPCODE_USEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USGE", TGSI_OPCODE_USGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USHR", TGSI_OPCODE_USHR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USLT", TGSI_OPCODE_USLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USNE", TGSI_OPCODE_USNE },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "SWITCH", TGSI_OPCODE_SWITCH },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "CASE", TGSI_OPCODE_CASE },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "DEFAULT", TGSI_OPCODE_DEFAULT },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH },
 
-   { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE",      TGSI_OPCODE_SAMPLE },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_I",    TGSI_OPCODE_SAMPLE_I },
-   { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE_I_MS", TGSI_OPCODE_SAMPLE_I_MS },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_B",    TGSI_OPCODE_SAMPLE_B },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C",    TGSI_OPCODE_SAMPLE_C },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C_LZ", TGSI_OPCODE_SAMPLE_C_LZ },
-   { 1, 5, 0, 0, 0, 0, OTHR, "SAMPLE_D",    TGSI_OPCODE_SAMPLE_D },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_L",    TGSI_OPCODE_SAMPLE_L },
-   { 1, 3, 0, 0, 0, 0, OTHR, "GATHER4",     TGSI_OPCODE_GATHER4 },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SVIEWINFO",   TGSI_OPCODE_SVIEWINFO },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_POS",  TGSI_OPCODE_SAMPLE_POS },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_INFO", TGSI_OPCODE_SAMPLE_INFO },
-   { 1, 1, 0, 0, 0, 0, COMP, "UARL", TGSI_OPCODE_UARL },
-   { 1, 3, 0, 0, 0, 0, COMP, "UCMP", TGSI_OPCODE_UCMP },
-   { 1, 1, 0, 0, 0, 0, COMP, "IABS", TGSI_OPCODE_IABS },
-   { 1, 1, 0, 0, 0, 0, COMP, "ISSG", TGSI_OPCODE_ISSG },
-   { 1, 2, 0, 0, 0, 0, OTHR, "LOAD", TGSI_OPCODE_LOAD },
-   { 1, 2, 0, 0, 0, 0, OTHR, "STORE", TGSI_OPCODE_STORE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "MFENCE", TGSI_OPCODE_MFENCE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "LFENCE", TGSI_OPCODE_LFENCE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "SFENCE", TGSI_OPCODE_SFENCE },
-   { 0, 0, 0, 0, 0, 0, OTHR, "BARRIER", TGSI_OPCODE_BARRIER },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "SAMPLE",      TGSI_OPCODE_SAMPLE },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_I",    TGSI_OPCODE_SAMPLE_I },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "SAMPLE_I_MS", TGSI_OPCODE_SAMPLE_I_MS },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_B",    TGSI_OPCODE_SAMPLE_B },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_C",    TGSI_OPCODE_SAMPLE_C },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_C_LZ", TGSI_OPCODE_SAMPLE_C_LZ },
+   { 1, 5, 0, 0, 0, 0, 0, OTHR, "SAMPLE_D",    TGSI_OPCODE_SAMPLE_D },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_L",    TGSI_OPCODE_SAMPLE_L },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "GATHER4",     TGSI_OPCODE_GATHER4 },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SVIEWINFO",   TGSI_OPCODE_SVIEWINFO },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_POS",  TGSI_OPCODE_SAMPLE_POS },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_INFO", TGSI_OPCODE_SAMPLE_INFO },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "UARL", TGSI_OPCODE_UARL },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UCMP", TGSI_OPCODE_UCMP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "IABS", TGSI_OPCODE_IABS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ISSG", TGSI_OPCODE_ISSG },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "LOAD", TGSI_OPCODE_LOAD },
+   { 1, 2, 0, 1, 0, 0, 0, OTHR, "STORE", TGSI_OPCODE_STORE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "MFENCE", TGSI_OPCODE_MFENCE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "LFENCE", TGSI_OPCODE_LFENCE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "SFENCE", TGSI_OPCODE_SFENCE },
+   { 0, 0, 0, 0, 0, 0, 0, OTHR, "BARRIER", TGSI_OPCODE_BARRIER },
 
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUADD", TGSI_OPCODE_ATOMUADD },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXCHG", TGSI_OPCODE_ATOMXCHG },
-   { 1, 4, 0, 0, 0, 0, OTHR, "ATOMCAS", TGSI_OPCODE_ATOMCAS },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMAND", TGSI_OPCODE_ATOMAND },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMOR", TGSI_OPCODE_ATOMOR },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXOR", TGSI_OPCODE_ATOMXOR },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMIN", TGSI_OPCODE_ATOMUMIN },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMAX", TGSI_OPCODE_ATOMUMAX },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMIN", TGSI_OPCODE_ATOMIMIN },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMAX", TGSI_OPCODE_ATOMIMAX },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TEX2", TGSI_OPCODE_TEX2 },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TXB2", TGSI_OPCODE_TXB2 },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TXL2", TGSI_OPCODE_TXL2 },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMUL_HI", TGSI_OPCODE_IMUL_HI },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMUL_HI", TGSI_OPCODE_UMUL_HI },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TG4", TGSI_OPCODE_TG4 },
-   { 1, 2, 1, 0, 0, 0, OTHR, "LODQ", TGSI_OPCODE_LODQ },
-   { 1, 3, 0, 0, 0, 0, COMP, "IBFE", TGSI_OPCODE_IBFE },
-   { 1, 3, 0, 0, 0, 0, COMP, "UBFE", TGSI_OPCODE_UBFE },
-   { 1, 4, 0, 0, 0, 0, COMP, "BFI", TGSI_OPCODE_BFI },
-   { 1, 1, 0, 0, 0, 0, COMP, "BREV", TGSI_OPCODE_BREV },
-   { 1, 1, 0, 0, 0, 0, COMP, "POPC", TGSI_OPCODE_POPC },
-   { 1, 1, 0, 0, 0, 0, COMP, "LSB", TGSI_OPCODE_LSB },
-   { 1, 1, 0, 0, 0, 0, COMP, "IMSB", TGSI_OPCODE_IMSB },
-   { 1, 1, 0, 0, 0, 0, COMP, "UMSB", TGSI_OPCODE_UMSB },
-   { 1, 1, 0, 0, 0, 0, OTHR, "INTERP_CENTROID", TGSI_OPCODE_INTERP_CENTROID },
-   { 1, 2, 0, 0, 0, 0, OTHR, "INTERP_SAMPLE", TGSI_OPCODE_INTERP_SAMPLE },
-   { 1, 2, 0, 0, 0, 0, OTHR, "INTERP_OFFSET", TGSI_OPCODE_INTERP_OFFSET },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2D", TGSI_OPCODE_F2D },
-   { 1, 1, 0, 0, 0, 0, COMP, "D2F", TGSI_OPCODE_D2F },
-   { 1, 1, 0, 0, 0, 0, COMP, "DABS", TGSI_OPCODE_DABS },
-   { 1, 1, 0, 0, 0, 0, COMP, "DNEG", TGSI_OPCODE_DNEG },
-   { 1, 2, 0, 0, 0, 0, COMP, "DADD", TGSI_OPCODE_DADD },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMUL", TGSI_OPCODE_DMUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMAX", TGSI_OPCODE_DMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMIN", TGSI_OPCODE_DMIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSLT", TGSI_OPCODE_DSLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSGE", TGSI_OPCODE_DSGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSEQ", TGSI_OPCODE_DSEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSNE", TGSI_OPCODE_DSNE },
-   { 1, 1, 0, 0, 0, 0, COMP, "DRCP", TGSI_OPCODE_DRCP },
-   { 1, 1, 0, 0 ,0, 0, COMP, "DSQRT", TGSI_OPCODE_DSQRT },
-   { 1, 3, 0, 0 ,0, 0, COMP, "DMAD", TGSI_OPCODE_DMAD },
-   { 1, 1, 0, 0, 0, 0, COMP, "DFRAC", TGSI_OPCODE_DFRAC},
-   { 1, 2, 0, 0, 0, 0, COMP, "DLDEXP", TGSI_OPCODE_DLDEXP},
-   { 2, 1, 0, 0, 0, 0, COMP, "DFRACEXP", TGSI_OPCODE_DFRACEXP},
-   { 1, 1, 0, 0, 0, 0, COMP, "D2I", TGSI_OPCODE_D2I },
-   { 1, 1, 0, 0, 0, 0, COMP, "I2D", TGSI_OPCODE_I2D },
-   { 1, 1, 0, 0, 0, 0, COMP, "D2U", TGSI_OPCODE_D2U },
-   { 1, 1, 0, 0, 0, 0, COMP, "U2D", TGSI_OPCODE_U2D },
-   { 1, 1, 0, 0 ,0, 0, COMP, "DRSQ", TGSI_OPCODE_DRSQ },
-   { 1, 1, 0, 0, 0, 0, COMP, "DTRUNC", TGSI_OPCODE_DTRUNC },
-   { 1, 1, 0, 0, 0, 0, COMP, "DCEIL", TGSI_OPCODE_DCEIL },
-   { 1, 1, 0, 0, 0, 0, COMP, "DFLR", TGSI_OPCODE_DFLR },
-   { 1, 1, 0, 0, 0, 0, COMP, "DROUND", TGSI_OPCODE_DROUND },
-   { 1, 1, 0, 0, 0, 0, COMP, "DSSG", TGSI_OPCODE_DSSG },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUADD", TGSI_OPCODE_ATOMUADD },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMXCHG", TGSI_OPCODE_ATOMXCHG },
+   { 1, 4, 0, 1, 0, 0, 0, OTHR, "ATOMCAS", TGSI_OPCODE_ATOMCAS },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMAND", TGSI_OPCODE_ATOMAND },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMOR", TGSI_OPCODE_ATOMOR },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMXOR", TGSI_OPCODE_ATOMXOR },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUMIN", TGSI_OPCODE_ATOMUMIN },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUMAX", TGSI_OPCODE_ATOMUMAX },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMIMIN", TGSI_OPCODE_ATOMIMIN },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMIMAX", TGSI_OPCODE_ATOMIMAX },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TEX2", TGSI_OPCODE_TEX2 },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TXB2", TGSI_OPCODE_TXB2 },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TXL2", TGSI_OPCODE_TXL2 },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMUL_HI", TGSI_OPCODE_IMUL_HI },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMUL_HI", TGSI_OPCODE_UMUL_HI },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TG4", TGSI_OPCODE_TG4 },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "LODQ", TGSI_OPCODE_LODQ },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "IBFE", TGSI_OPCODE_IBFE },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UBFE", TGSI_OPCODE_UBFE },
+   { 1, 4, 0, 0, 0, 0, 0, COMP, "BFI", TGSI_OPCODE_BFI },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "BREV", TGSI_OPCODE_BREV },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "POPC", TGSI_OPCODE_POPC },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "LSB", TGSI_OPCODE_LSB },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "IMSB", TGSI_OPCODE_IMSB },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "UMSB", TGSI_OPCODE_UMSB },
+   { 1, 1, 0, 0, 0, 0, 0, OTHR, "INTERP_CENTROID", TGSI_OPCODE_INTERP_CENTROID },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "INTERP_SAMPLE", TGSI_OPCODE_INTERP_SAMPLE },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "INTERP_OFFSET", TGSI_OPCODE_INTERP_OFFSET },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2D", TGSI_OPCODE_F2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2F", TGSI_OPCODE_D2F },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DABS", TGSI_OPCODE_DABS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DNEG", TGSI_OPCODE_DNEG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DADD", TGSI_OPCODE_DADD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMUL", TGSI_OPCODE_DMUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMAX", TGSI_OPCODE_DMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMIN", TGSI_OPCODE_DMIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSLT", TGSI_OPCODE_DSLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSGE", TGSI_OPCODE_DSGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSEQ", TGSI_OPCODE_DSEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSNE", TGSI_OPCODE_DSNE },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DRCP", TGSI_OPCODE_DRCP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DSQRT", TGSI_OPCODE_DSQRT },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "DMAD", TGSI_OPCODE_DMAD },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DFRAC", TGSI_OPCODE_DFRAC},
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DLDEXP", TGSI_OPCODE_DLDEXP},
+   { 2, 1, 0, 0, 0, 0, 0, COMP, "DFRACEXP", TGSI_OPCODE_DFRACEXP},
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2I", TGSI_OPCODE_D2I },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "I2D", TGSI_OPCODE_I2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2U", TGSI_OPCODE_D2U },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "U2D", TGSI_OPCODE_U2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DRSQ", TGSI_OPCODE_DRSQ },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DTRUNC", TGSI_OPCODE_DTRUNC },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DCEIL", TGSI_OPCODE_DCEIL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DFLR", TGSI_OPCODE_DFLR },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DROUND", TGSI_OPCODE_DROUND },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DSSG", TGSI_OPCODE_DSSG },
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h
index aa7edd1e114..46f03cd393f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.h
@@ -74,6 +74,7 @@ struct tgsi_opcode_info
    unsigned num_dst:3;
    unsigned num_src:3;
    unsigned is_tex:1;
+   unsigned is_store:1;
    unsigned is_branch:1;
    int pre_dedent:2;
    int post_indent:2;

From 8cb493acc70ffcbb244755434def6c7d41a6f6c1 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 27 Sep 2015 01:23:38 -0400
Subject: [PATCH 225/241] tgsi: update atomic op docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Specify that the operation only applies to the x component, not
per-component as previously specified. Per-component atomics are
unnecessary for GL and would create additional complications for images,
which need to support these operations as well.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/docs/source/tgsi.rst | 93 ++++++++++++++++----------------
 1 file changed, 47 insertions(+), 46 deletions(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 8fe971b2f7a..b6acc3fda95 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2264,11 +2264,11 @@ after lookup.
 Resource Access Opcodes
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-.. opcode:: LOAD - Fetch data from a shader resource
+.. opcode:: LOAD - Fetch data from a shader buffer or image
 
                Syntax: ``LOAD dst, resource, address``
 
-               Example: ``LOAD TEMP[0], RES[0], TEMP[1]``
+               Example: ``LOAD TEMP[0], BUFFER[0], TEMP[1]``
 
                Using the provided integer address, LOAD fetches data
                from the specified buffer or texture without any
@@ -2292,7 +2292,7 @@ Resource Access Opcodes
 
                Syntax: ``STORE resource, address, src``
 
-               Example: ``STORE RES[0], TEMP[0], TEMP[1]``
+               Example: ``STORE BUFFER[0], TEMP[0], TEMP[1]``
 
                Using the provided integer address, STORE writes data
                to the specified buffer or texture.
@@ -2370,158 +2370,159 @@ These opcodes provide atomic variants of some common arithmetic and
 logical operations.  In this context atomicity means that another
 concurrent memory access operation that affects the same memory
 location is guaranteed to be performed strictly before or after the
-entire execution of the atomic operation.
-
-For the moment they're only valid in compute programs.
+entire execution of the atomic operation. The resource may be a buffer
+or an image. In the case of an image, the offset works the same as for
+``LOAD`` and ``STORE``, specified above. These atomic operations may
+only be used with 32-bit integer image formats.
 
 .. opcode:: ATOMUADD - Atomic integer addition
 
   Syntax: ``ATOMUADD dst, resource, offset, src``
 
-  Example: ``ATOMUADD TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUADD TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i + src_i
+  resource[offset] = dst_x + src_x
 
 
 .. opcode:: ATOMXCHG - Atomic exchange
 
   Syntax: ``ATOMXCHG dst, resource, offset, src``
 
-  Example: ``ATOMXCHG TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMXCHG TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = src_i
+  resource[offset] = src_x
 
 
 .. opcode:: ATOMCAS - Atomic compare-and-exchange
 
   Syntax: ``ATOMCAS dst, resource, offset, cmp, src``
 
-  Example: ``ATOMCAS TEMP[0], RES[0], TEMP[1], TEMP[2], TEMP[3]``
+  Example: ``ATOMCAS TEMP[0], BUFFER[0], TEMP[1], TEMP[2], TEMP[3]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i == cmp_i ? src_i : dst_i)
+  resource[offset] = (dst_x == cmp_x ? src_x : dst_x)
 
 
 .. opcode:: ATOMAND - Atomic bitwise And
 
   Syntax: ``ATOMAND dst, resource, offset, src``
 
-  Example: ``ATOMAND TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMAND TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i \& src_i
+  resource[offset] = dst_x \& src_x
 
 
 .. opcode:: ATOMOR - Atomic bitwise Or
 
   Syntax: ``ATOMOR dst, resource, offset, src``
 
-  Example: ``ATOMOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMOR TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i | src_i
+  resource[offset] = dst_x | src_x
 
 
 .. opcode:: ATOMXOR - Atomic bitwise Xor
 
   Syntax: ``ATOMXOR dst, resource, offset, src``
 
-  Example: ``ATOMXOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMXOR TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i \oplus src_i
+  resource[offset] = dst_x \oplus src_x
 
 
 .. opcode:: ATOMUMIN - Atomic unsigned minimum
 
   Syntax: ``ATOMUMIN dst, resource, offset, src``
 
-  Example: ``ATOMUMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUMIN TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+  resource[offset] = (dst_x < src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMUMAX - Atomic unsigned maximum
 
   Syntax: ``ATOMUMAX dst, resource, offset, src``
 
-  Example: ``ATOMUMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUMAX TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+  resource[offset] = (dst_x > src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMIMIN - Atomic signed minimum
 
   Syntax: ``ATOMIMIN dst, resource, offset, src``
 
-  Example: ``ATOMIMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMIMIN TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+  resource[offset] = (dst_x < src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMIMAX - Atomic signed maximum
 
   Syntax: ``ATOMIMAX dst, resource, offset, src``
 
-  Example: ``ATOMIMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMIMAX TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+  resource[offset] = (dst_x > src_x ? dst_x : src_x)
 
 
 

From 266d001261b19c6124e10c05cf3d8054b2db380b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 26 Sep 2015 20:27:42 -0400
Subject: [PATCH 226/241] gallium: add PIPE_SHADER_CAP_MAX_SHADER_BUFFERS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_limits.h    | 1 +
 src/gallium/auxiliary/tgsi/tgsi_exec.h           | 1 +
 src/gallium/docs/source/screen.rst               | 4 ++++
 src/gallium/drivers/freedreno/freedreno_screen.c | 2 ++
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 2 ++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 2 ++
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 2 ++
 src/gallium/drivers/svga/svga_screen.c           | 3 +++
 src/gallium/drivers/vc4/vc4_screen.c             | 2 ++
 src/gallium/include/pipe/p_defines.h             | 1 +
 13 files changed, 23 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index ad64ae058b6..4598db851ae 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -136,6 +136,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index f86adcec506..26fec8e2142 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -473,6 +473,7 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
       return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index fc08bb9ac34..11365b2f93c 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -389,6 +389,10 @@ to be 0.
   of iterations that loops are allowed to have to be unrolled. It is only
   a hint to state trackers. Whether any loops will be unrolled is not
   guaranteed.
+* ``PIPE_SHADER_CAP_MAX_SHADER_BUFFERS``: Maximum number of memory buffers
+  (also used to implement atomic counters). Having this be non-0 also
+  implies support for the ``LOAD``, ``STORE``, and ``ATOM*`` TGSI
+  opcodes.
 
 
 .. _pipe_compute_cap:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index e940b1c21e6..4536b04e04a 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -420,6 +420,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		return 0;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index e63767d8aa0..ee62e5da20f 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -271,6 +271,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -314,6 +315,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 68d2acd4bcd..161f227806f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -306,6 +306,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index e4749eed7ab..ee1f8038682 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -317,6 +317,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       return 16; /* would be 32 in linked (OpenGL-style) mode */
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index a376590ab61..0591c2f5f4b 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -310,6 +310,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
@@ -368,6 +369,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 9e5824202aa..877088d2224 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -528,6 +528,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
 		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		/* due to a bug in the shader compiler, some loops hang
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 50b23472467..30e220abe76 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -518,6 +518,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		return 1;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+		return 0;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 4285b1c2e00..466454a58f3 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -463,6 +463,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -521,6 +522,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -612,6 +614,7 @@ vgpu10_get_shader_param(struct pipe_screen *screen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 58f8ad9d510..e655192e051 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -351,6 +351,8 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                 return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+                return 0;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
                 return 0;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 8e48528944e..b0121132d1e 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -707,6 +707,7 @@ enum pipe_shader_cap
    PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED,
    PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE,
    PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT,
+   PIPE_SHADER_CAP_MAX_SHADER_BUFFERS,
 };
 
 /**

From ebfb5446c78cd3e9f3f3e92fef88e4b0645a34e8 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 2 Jan 2016 21:56:45 -0500
Subject: [PATCH 227/241] gallium: add PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/docs/source/screen.rst            |  4 +++
 .../drivers/freedreno/freedreno_screen.c      |  1 +
 src/gallium/drivers/i915/i915_screen.c        |  1 +
 src/gallium/drivers/ilo/ilo_screen.c          |  1 +
 src/gallium/drivers/llvmpipe/lp_screen.c      |  1 +
 .../drivers/nouveau/nv30/nv30_screen.c        |  1 +
 .../drivers/nouveau/nv50/nv50_screen.c        |  1 +
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  1 +
 src/gallium/drivers/r300/r300_screen.c        |  1 +
 src/gallium/drivers/r600/r600_pipe.c          |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c        |  1 +
 src/gallium/drivers/softpipe/sp_screen.c      |  1 +
 src/gallium/drivers/svga/svga_screen.c        |  1 +
 src/gallium/drivers/vc4/vc4_screen.c          | 27 ++++++++++---------
 src/gallium/drivers/virgl/virgl_screen.c      |  1 +
 src/gallium/include/pipe/p_defines.h          |  1 +
 16 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 11365b2f93c..c8f5f6a461e 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -297,6 +297,10 @@ The integer capabilities:
 * ``PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL``: If state trackers should use
   a system value for the FACE fragment shader input.
   Also, the FACE system value is integer, not float.
+* ``PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT``: Describes the required
+  alignment for pipe_shader_buffer::buffer_offset, in bytes. Maximum
+  value allowed is 256 (for GL conformance). 0 is only allowed if
+  shader buffers are not supported.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 4536b04e04a..9d0cdd8e545 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -244,6 +244,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
 	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 2289eb58c49..e2a493bc1b5 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -258,6 +258,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index c26d4492d3a..d5a82ce80ae 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -482,6 +482,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 1407b2688de..e29b008c7e8 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -307,6 +307,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index ee62e5da20f..d9c940232c4 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -180,6 +180,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 161f227806f..867b366c986 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -223,6 +223,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index ee1f8038682..9a95a54f9cf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -212,6 +212,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 0591c2f5f4b..d1b59ab4345 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -206,6 +206,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
         case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
         case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 877088d2224..e61d9286542 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -354,6 +354,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
 	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 30e220abe76..c2ca94339ac 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -346,6 +346,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
 	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 36510d5eb40..29e392b94e8 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -257,6 +257,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 466454a58f3..0f41e4ea254 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -352,6 +352,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index e655192e051..0e289432bbe 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -182,19 +182,20 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
-	case PIPE_CAP_DEPTH_BOUNDS_TEST:
-	case PIPE_CAP_TGSI_TXQS:
-	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
-	case PIPE_CAP_SHAREABLE_SHADERS:
-	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
-	case PIPE_CAP_CLEAR_TEXTURE:
-	case PIPE_CAP_DRAW_PARAMETERS:
-	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
-	case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
-	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+        case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+        case PIPE_CAP_DEPTH_BOUNDS_TEST:
+        case PIPE_CAP_TGSI_TXQS:
+        case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+        case PIPE_CAP_SHAREABLE_SHADERS:
+        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+        case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index bf048da42a3..e8d82b37c0f 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -225,6 +225,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b0121132d1e..dd76fe553e4 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -641,6 +641,7 @@ enum pipe_cap
    PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS,
    PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL,
    PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL,
+   PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From 90ba06618ef9fe22bb4be4604a233c9a3a0ea1f7 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 13 Dec 2015 22:11:25 -0500
Subject: [PATCH 228/241] gallium: add a RESQ opcode to query info about a
 resource
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_info.c     |  2 +-
 src/gallium/docs/source/tgsi.rst           | 12 ++++++++++++
 src/gallium/include/pipe/p_shader_tokens.h |  1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index b94aa63cac5..b270dd73b67 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -142,7 +142,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB },
    { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ },
    { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQS", TGSI_OPCODE_TXQS },
-   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 105 },     /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, NONE, "RESQ", TGSI_OPCODE_RESQ },
    { 0, 0, 0, 0, 0, 0, 0, NONE, "", 106 },     /* removed */
    { 0, 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP },
    { 1, 2, 0, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ },
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index b6acc3fda95..7810a3eb915 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2311,6 +2311,18 @@ Resource Access Opcodes
                texture arrays and 2D textures.  address.w is always
                ignored.
 
+.. opcode:: RESQ - Query information about a resource
+
+  Syntax: ``RESQ dst, resource``
+
+  Example: ``RESQ TEMP[0], BUFFER[0]``
+
+  Returns information about the buffer or image resource. For buffer
+  resources, the size (in bytes) is returned in the x component. For
+  image resources, .xyz will contain the width/height/layers of the
+  image, while .w will contain the number of samples for multi-sampled
+  images.
+
 
 .. _threadsyncopcodes:
 
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 43a5561882c..f300207d4dd 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -411,6 +411,7 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_ENDSUB              102
 #define TGSI_OPCODE_TXQ_LZ              103 /* TXQ for mipmap level 0 */
 #define TGSI_OPCODE_TXQS                104
+#define TGSI_OPCODE_RESQ                105
                                 /* gap */
 #define TGSI_OPCODE_NOP                 107
 

From dff1caccac370adb1c03609b7bfe12b9195b5a61 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 8 Jan 2016 15:09:26 -0500
Subject: [PATCH 229/241] freedreno: add ir3_compiler to gitignore

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/freedreno/.gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 src/gallium/drivers/freedreno/.gitignore

diff --git a/src/gallium/drivers/freedreno/.gitignore b/src/gallium/drivers/freedreno/.gitignore
new file mode 100644
index 00000000000..150f5d19f5b
--- /dev/null
+++ b/src/gallium/drivers/freedreno/.gitignore
@@ -0,0 +1 @@
+ir3_compiler

From 5d349fab46a579b348bd8f1ab34169affa7287f0 Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Thu, 29 Oct 2015 15:56:18 -0700
Subject: [PATCH 230/241] mesa: docs: Add link to planet.freedesktop.org

The freedesktop.org blog feeds aren't mentioned on either mesa3d.org or
any of the graphics project wikis (including the DRI wiki) on
freedesktop.org.  Fix that by linking to it from the sidebar.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 docs/contents.html | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/contents.html b/docs/contents.html
index 5e0f514c1e1..294804f6f5a 100644
--- a/docs/contents.html
+++ b/docs/contents.html
@@ -90,6 +90,7 @@
 <li><a href="http://www.opengl.org" target="_parent">OpenGL website</a>
 <li><a href="http://dri.freedesktop.org" target="_parent">DRI website</a>
 <li><a href="http://www.freedesktop.org" target="_parent">freedesktop.org</a>
+<li><a href="http://planet.freedesktop.org" target="_parent">Developer blogs</a>
 </ul>
 
 <b>Hosted by:</b>

From cf66a8ffb75a7881f03222b2345c77f3b0be7e64 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Fri, 23 Oct 2015 17:07:42 -0700
Subject: [PATCH 231/241] mesa: Map program UBOs and SSBOs to Interface Blocks

v2:
 * Fill UboInterfaceBlockIndex and SsboInterfaceBlockIndex in
   split_ubos_and_ssbos (Iago)

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/linker.cpp                 | 30 ++++++++++++++++++++++++-----
 src/glsl/standalone_scaffolding.cpp |  5 +++++
 src/mesa/main/mtypes.h              |  7 +++++++
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 7a18523fe23..418bd09e49e 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3921,8 +3921,10 @@ split_ubos_and_ssbos(void *mem_ctx,
                      unsigned num_blocks,
                      struct gl_uniform_block ***ubos,
                      unsigned *num_ubos,
+                     unsigned **ubo_interface_block_indices,
                      struct gl_uniform_block ***ssbos,
-                     unsigned *num_ssbos)
+                     unsigned *num_ssbos,
+                     unsigned **ssbo_interface_block_indices)
 {
    unsigned num_ubo_blocks = 0;
    unsigned num_ssbo_blocks = 0;
@@ -3940,11 +3942,25 @@ split_ubos_and_ssbos(void *mem_ctx,
    *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
    *num_ssbos = 0;
 
+   if (ubo_interface_block_indices)
+      *ubo_interface_block_indices =
+         ralloc_array(mem_ctx, unsigned, num_ubo_blocks);
+
+   if (ssbo_interface_block_indices)
+      *ssbo_interface_block_indices =
+         ralloc_array(mem_ctx, unsigned, num_ssbo_blocks);
+
    for (unsigned i = 0; i < num_blocks; i++) {
       if (blocks[i].IsShaderStorage) {
-         (*ssbos)[(*num_ssbos)++] = &blocks[i];
+         (*ssbos)[*num_ssbos] = &blocks[i];
+         if (ssbo_interface_block_indices)
+            (*ssbo_interface_block_indices)[*num_ssbos] = i;
+         (*num_ssbos)++;
       } else {
-         (*ubos)[(*num_ubos)++] = &blocks[i];
+         (*ubos)[*num_ubos] = &blocks[i];
+         if (ubo_interface_block_indices)
+            (*ubo_interface_block_indices)[*num_ubos] = i;
+         (*num_ubos)++;
       }
    }
 
@@ -4536,8 +4552,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                               sh->NumBufferInterfaceBlocks,
                               &sh->UniformBlocks,
                               &sh->NumUniformBlocks,
+                              NULL,
                               &sh->ShaderStorageBlocks,
-                              &sh->NumShaderStorageBlocks);
+                              &sh->NumShaderStorageBlocks,
+                              NULL);
       }
    }
 
@@ -4546,8 +4564,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                         prog->NumBufferInterfaceBlocks,
                         &prog->UniformBlocks,
                         &prog->NumUniformBlocks,
+                        &prog->UboInterfaceBlockIndex,
                         &prog->ShaderStorageBlocks,
-                        &prog->NumShaderStorageBlocks);
+                        &prog->NumShaderStorageBlocks,
+                        &prog->SsboInterfaceBlockIndex);
 
    /* FINISHME: Assign fragment shader output locations. */
 
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index e350f702099..d5d214b57cc 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -124,6 +124,11 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
       shProg->InterfaceBlockStageIndex[i] = NULL;
    }
 
+   ralloc_free(shProg->UboInterfaceBlockIndex);
+   shProg->UboInterfaceBlockIndex = NULL;
+   ralloc_free(shProg->SsboInterfaceBlockIndex);
+   shProg->SsboInterfaceBlockIndex = NULL;
+
    ralloc_free(shProg->AtomicBuffers);
    shProg->AtomicBuffers = NULL;
    shProg->NumAtomicBuffers = 0;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 41f5283679d..8951774e714 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2735,6 +2735,13 @@ struct gl_shader_program
     */
    int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
 
+   /**
+    * Indices into the BufferInterfaceBlocks[] array for Uniform Buffer
+    * Objects and Shader Storage Buffer Objects.
+    */
+   unsigned *UboInterfaceBlockIndex;
+   unsigned *SsboInterfaceBlockIndex;
+
    /**
     * Map of active uniform names to locations
     *

From 1d54ac6c9f41df240497e96770415b019ffdc6b3 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Fri, 23 Oct 2015 17:08:33 -0700
Subject: [PATCH 232/241] mesa: Use separate indices for UBO & SSBO during
 binding

Previously we were treating the binding index for Uniform Buffer
Objects and Shader Storage Buffer Objects as being part of the
combined BufferInterfaceBlocks array.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93322
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/main/uniforms.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 758ca2456df..47f80ce2001 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1002,10 +1002,10 @@ _mesa_UniformBlockBinding(GLuint program,
    if (!shProg)
       return;
 
-   if (uniformBlockIndex >= shProg->NumBufferInterfaceBlocks) {
+   if (uniformBlockIndex >= shProg->NumUniformBlocks) {
       _mesa_error(ctx, GL_INVALID_VALUE,
 		  "glUniformBlockBinding(block index %u >= %u)",
-		  uniformBlockIndex, shProg->NumBufferInterfaceBlocks);
+		  uniformBlockIndex, shProg->NumUniformBlocks);
       return;
    }
 
@@ -1016,17 +1016,22 @@ _mesa_UniformBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding !=
+   if (shProg->UniformBlocks[uniformBlockIndex]->Binding !=
        uniformBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
 
-      shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
+      const int interface_block_index =
+         shProg->UboInterfaceBlockIndex[uniformBlockIndex];
+
+      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+         uniformBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index = shProg->InterfaceBlockStageIndex[i][uniformBlockIndex];
+	 int stage_index =
+            shProg->InterfaceBlockStageIndex[i][interface_block_index];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];
@@ -1054,10 +1059,10 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
    if (!shProg)
       return;
 
-   if (shaderStorageBlockIndex >= shProg->NumBufferInterfaceBlocks) {
+   if (shaderStorageBlockIndex >= shProg->NumShaderStorageBlocks) {
       _mesa_error(ctx, GL_INVALID_VALUE,
 		  "glShaderStorageBlockBinding(block index %u >= %u)",
-		  shaderStorageBlockIndex, shProg->NumBufferInterfaceBlocks);
+		  shaderStorageBlockIndex, shProg->NumShaderStorageBlocks);
       return;
    }
 
@@ -1069,17 +1074,22 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding !=
+   if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding !=
        shaderStorageBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding;
+      const int interface_block_index =
+         shProg->SsboInterfaceBlockIndex[shaderStorageBlockIndex];
+
+      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+         shaderStorageBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index = shProg->InterfaceBlockStageIndex[i][shaderStorageBlockIndex];
+	 int stage_index =
+            shProg->InterfaceBlockStageIndex[i][interface_block_index];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];

From e97caba1f6c2bd803f9c8b969b52c21f93daf1d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg=20Kristensen?=
 <kristian.h.kristensen@intel.com>
Date: Fri, 8 Jan 2016 12:35:18 -0800
Subject: [PATCH 233/241] glsl: Move glsl_to_nir files to LIBGLSL_FILES

libglsl_la_SOURCES includes both NIR_FILES and LIBGLSL_FILES, so for
libglsl.la consumers, this is a no-op. libnir.la however no longer uses
any GLSL IR infrastructure and can be used without also linking to
libglsl.la.

Acked-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/Makefile.sources | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index fc10f14f4c1..4da9b072892 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -18,8 +18,6 @@ NIR_GENERATED_FILES = \
 	nir/nir_opt_algebraic.c
 
 NIR_FILES = \
-	nir/glsl_to_nir.cpp \
-	nir/glsl_to_nir.h \
 	nir/glsl_types.cpp \
 	nir/glsl_types.h \
 	nir/builtin_type_macros.h \
@@ -184,6 +182,8 @@ LIBGLSL_FILES = \
 	lower_output_reads.cpp \
 	lower_shared_reference.cpp \
 	lower_ubo_reference.cpp \
+	nir/glsl_to_nir.cpp \
+	nir/glsl_to_nir.h \
 	opt_algebraic.cpp \
 	opt_array_splitting.cpp \
 	opt_conditional_discard.cpp \

From 1d25ef6ae7717cb9720efc1f3a54591d2ff0e355 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg=20Kristensen?=
 <kristian.h.kristensen@intel.com>
Date: Fri, 8 Jan 2016 12:35:38 -0800
Subject: [PATCH 234/241] i965: Move GLSL lowering passes out of
 libi965_compiler.la

The scope of libi965_compiler.la is to be able to take nir shaders and
generate i965 EU code.  As such, we don't want the GLSL IR lowering
passes in the library. With this change, libi965_compiler.la no longer
needs to link to libglsl.la.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 05c49ee9a12..5aeeca57f37 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -2,7 +2,6 @@ i965_compiler_FILES = \
 	brw_cfg.cpp \
 	brw_cfg.h \
 	brw_compiler.h \
-	brw_cubemap_normalize.cpp \
 	brw_dead_control_flow.cpp \
 	brw_dead_control_flow.h \
 	brw_defines.h \
@@ -16,7 +15,6 @@ i965_compiler_FILES = \
 	brw_eu_util.c \
 	brw_eu_validate.c \
 	brw_fs_builder.h \
-	brw_fs_channel_expressions.cpp \
 	brw_fs_cmod_propagation.cpp \
 	brw_fs_combine_constants.cpp \
 	brw_fs_copy_propagation.cpp \
@@ -35,15 +33,12 @@ i965_compiler_FILES = \
 	brw_fs_surface_builder.cpp \
 	brw_fs_surface_builder.h \
 	brw_fs_validate.cpp \
-	brw_fs_vector_splitting.cpp \
 	brw_fs_visitor.cpp \
 	brw_inst.h \
 	brw_interpolation_map.c \
 	brw_ir_allocator.h \
 	brw_ir_fs.h \
 	brw_ir_vec4.h \
-	brw_lower_texture_gradients.cpp \
-	brw_lower_unnormalized_offset.cpp \
 	brw_nir.h \
 	brw_nir.c \
 	brw_nir_analyze_boolean_resolves.c \
@@ -114,6 +109,7 @@ i965_FILES = \
 	brw_context.h \
 	brw_cs.c \
 	brw_cs.h \
+	brw_cubemap_normalize.cpp \
 	brw_curbe.c \
 	brw_draw.c \
 	brw_draw.h \
@@ -121,11 +117,15 @@ i965_FILES = \
 	brw_ff_gs.c \
 	brw_ff_gs_emit.c \
 	brw_ff_gs.h \
+	brw_fs_channel_expressions.cpp \
+	brw_fs_vector_splitting.cpp \
 	brw_gs.c \
 	brw_gs.h \
 	brw_gs_state.c \
 	brw_gs_surface_state.c \
 	brw_link.cpp \
+	brw_lower_texture_gradients.cpp \
+	brw_lower_unnormalized_offset.cpp \
 	brw_meta_fast_clear.c \
 	brw_meta_stencil_blit.c \
 	brw_meta_updownsample.c \

From 82ad571abf2fa2d85047451690f6a335f66d25fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg=20Kristensen?=
 <kristian.h.kristensen@intel.com>
Date: Fri, 8 Jan 2016 12:35:48 -0800
Subject: [PATCH 235/241] glsl: Move _mesa_shader_stage_to_string/abbrev to
 shader_enums.c

These are used by code that doesn't necessarily link to libglsl.la. Move
them to shader_enums.[ch] where we keep similar helpers.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/glsl_parser_extras.cpp               | 38 -------------------
 src/glsl/glsl_parser_extras.h                 | 10 -----
 src/glsl/nir/shader_enums.c                   | 36 ++++++++++++++++++
 src/glsl/nir/shader_enums.h                   | 20 ++++++++++
 src/mesa/drivers/dri/i965/brw_link.cpp        |  1 -
 src/mesa/drivers/dri/i965/brw_shader.cpp      |  1 -
 .../drivers/dri/i965/brw_vec4_generator.cpp   |  1 -
 src/mesa/drivers/dri/i965/gen6_vs_state.c     |  1 -
 8 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 8c46f147941..1d74db54869 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -412,44 +412,6 @@ _mesa_glsl_parse_state::process_version_directive(YYLTYPE *locp, int version,
 }
 
 
-/**
- * Translate a gl_shader_stage to a short shader stage name for debug
- * printouts and error messages.
- */
-const char *
-_mesa_shader_stage_to_string(unsigned stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:   return "vertex";
-   case MESA_SHADER_FRAGMENT: return "fragment";
-   case MESA_SHADER_GEOMETRY: return "geometry";
-   case MESA_SHADER_COMPUTE:  return "compute";
-   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
-   case MESA_SHADER_TESS_EVAL: return "tess eval";
-   }
-
-   unreachable("Unknown shader stage.");
-}
-
-/**
- * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
- * for debug printouts and error messages.
- */
-const char *
-_mesa_shader_stage_to_abbrev(unsigned stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:   return "VS";
-   case MESA_SHADER_FRAGMENT: return "FS";
-   case MESA_SHADER_GEOMETRY: return "GS";
-   case MESA_SHADER_COMPUTE:  return "CS";
-   case MESA_SHADER_TESS_CTRL: return "TCS";
-   case MESA_SHADER_TESS_EVAL: return "TES";
-   }
-
-   unreachable("Unknown shader stage.");
-}
-
 /* This helper function will append the given message to the shader's
    info log and report it via GL_ARB_debug_output. Per that extension,
    'type' is one of the enum values classifying the message, and
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index afb99afa5cd..ecc29920918 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -731,16 +731,6 @@ extern bool _mesa_glsl_process_extension(const char *name, YYLTYPE *name_locp,
 extern "C" {
 #endif
 
-/**
- * Get the textual name of the specified shader stage (which is a
- * gl_shader_stage).
- */
-extern const char *
-_mesa_shader_stage_to_string(unsigned stage);
-
-extern const char *
-_mesa_shader_stage_to_abbrev(unsigned stage);
-
 extern int glcpp_preprocess(void *ctx, const char **shader, char **info_log,
                       const struct gl_extensions *extensions, struct gl_context *gl_ctx);
 
diff --git a/src/glsl/nir/shader_enums.c b/src/glsl/nir/shader_enums.c
index 66a25e72344..10f546a9814 100644
--- a/src/glsl/nir/shader_enums.c
+++ b/src/glsl/nir/shader_enums.c
@@ -47,6 +47,42 @@ const char * gl_shader_stage_name(gl_shader_stage stage)
    return NAME(stage);
 }
 
+/**
+ * Translate a gl_shader_stage to its short lower-case name ("vertex",
+ * "tess ctrl", ...) for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_string(unsigned stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:   return "vertex";
+   case MESA_SHADER_FRAGMENT: return "fragment";
+   case MESA_SHADER_GEOMETRY: return "geometry";
+   case MESA_SHADER_COMPUTE:  return "compute";
+   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
+   case MESA_SHADER_TESS_EVAL: return "tess eval";
+   }
+
+   unreachable("Unknown shader stage.");
+}
+
+/**
+ * Translate a gl_shader_stage to its upper-case abbreviation (VS, FS, GS,
+ * CS, TCS or TES) for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_abbrev(unsigned stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:   return "VS";
+   case MESA_SHADER_FRAGMENT: return "FS";
+   case MESA_SHADER_GEOMETRY: return "GS";
+   case MESA_SHADER_COMPUTE:  return "CS";
+   case MESA_SHADER_TESS_CTRL: return "TCS";
+   case MESA_SHADER_TESS_EVAL: return "TES";
+   }
+
+   unreachable("Unknown shader stage.");
+}
+
 const char * gl_vert_attrib_name(gl_vert_attrib attrib)
 {
    static const char *names[] = {
diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h
index 8a2a81a333d..c747464d094 100644
--- a/src/glsl/nir/shader_enums.h
+++ b/src/glsl/nir/shader_enums.h
@@ -26,6 +26,10 @@
 #ifndef SHADER_ENUMS_H
 #define SHADER_ENUMS_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * Shader stages. Note that these will become 5 with tessellation.
  *
@@ -45,6 +49,18 @@ typedef enum
 
 const char * gl_shader_stage_name(gl_shader_stage stage);
 
+/**
+ * Translate a gl_shader_stage to its short lower-case name ("vertex",
+ * "tess ctrl", ...) for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_string(unsigned stage);
+
+/**
+ * Translate a gl_shader_stage to its upper-case abbreviation (VS, FS, GS,
+ * CS, TCS or TES) for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_abbrev(unsigned stage);
+
 #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
 
 
@@ -519,4 +535,8 @@ enum gl_frag_depth_layout
    FRAG_DEPTH_LAYOUT_UNCHANGED
 };
 
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 #endif /* SHADER_ENUMS_H */
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 7cdc830f6b8..766c57ff60a 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -27,7 +27,6 @@
 #include "brw_nir.h"
 #include "brw_program.h"
 #include "glsl/ir_optimization.h"
-#include "glsl/glsl_parser_extras.h"
 #include "program/program.h"
 #include "main/shaderapi.h"
 #include "main/uniforms.h"
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 49ff835fa85..efc24f92f58 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -27,7 +27,6 @@
 #include "brw_fs.h"
 #include "brw_nir.h"
 #include "brw_vec4_tes.h"
-#include "glsl/glsl_parser_extras.h"
 #include "main/shaderobj.h"
 #include "main/uniforms.h"
 #include "util/debug.h"
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 6b03a1c3db5..8173202a752 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -20,7 +20,6 @@
  * IN THE SOFTWARE.
  */
 
-#include "glsl/glsl_parser_extras.h"
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 #include "brw_eu.h"
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index da3b4cd90e8..4bc0a8598d6 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -32,7 +32,6 @@
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
-#include "glsl/glsl_parser_extras.h"
 
 /**
  * Creates a streamed BO containing the push constants for the VS or GS on

From e3706a7118cb572077c877b92450dc955e7a6754 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 8 Jan 2016 17:32:56 -0500
Subject: [PATCH 236/241] nv50,nvc0: use a face sysval to avoid the useless
 back-and-forth conversion

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 2 --
 src/gallium/drivers/nouveau/nv50/nv50_program.c           | 4 ----
 src/gallium/drivers/nouveau/nv50/nv50_screen.c            | 2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c           | 1 -
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c            | 2 +-
 5 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index e3db975b26f..7b313f3c39c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1660,8 +1660,6 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
          // don't load masked inputs, won't be assigned a slot
          if (!ptr && !(info->in[idx].mask & (1 << swz)))
             return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
-         if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
-            return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
          return interpolate(src, c, shiftAddress(ptr));
       } else
       if (prog->getType() == Program::TYPE_GEOMETRY) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index b63584e0a09..888d62e1c52 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -148,7 +148,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
    for (m = 0, i = 0; i < info->numInputs; ++i) {
       switch (info->in[i].sn) {
       case TGSI_SEMANTIC_POSITION:
-      case TGSI_SEMANTIC_FACE:
          continue;
       default:
          m += info->in[i].flat ? 0 : 1;
@@ -166,9 +165,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
          for (c = 0; c < 4; ++c)
             if (info->in[i].mask & (1 << c))
                info->in[i].slot[c] = nintp++;
-      } else
-      if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
-         info->in[i].slot[0] = 255;
       } else {
          unsigned j = info->in[i].flat ? m++ : n++;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 867b366c986..56c67e0ddfb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_COMPUTE:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -222,7 +223,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
-   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 7cb86e3b905..c3b53621630 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -55,7 +55,6 @@ nvc0_shader_input_address(unsigned sn, unsigned si)
    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
-   case TGSI_SEMANTIC_FACE:         return 0x3fc;
    default:
       assert(!"invalid TGSI input semantic");
       return ~0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 9a95a54f9cf..33dd17ebeca 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -188,6 +188,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -211,7 +212,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
-   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 0;
 

From 81f7fd3c54fa231081ceaf6e17af23e34a786c58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg=20Kristensen?=
 <kristian.h.kristensen@intel.com>
Date: Fri, 8 Jan 2016 15:23:56 -0800
Subject: [PATCH 237/241] glsl: Don't add nir files to libglsl_la_SOURCES

SCons doesn't understand nir yet and doesn't want to compile the glsl to
nir pass. Move the files to their own variable so we can add it only for
automake.

Tested-by: Brian Paul <brianp@vmware.com>
---
 src/glsl/Makefile.am      | 4 ++--
 src/glsl/Makefile.sources | 7 +++++--
 src/glsl/SConscript       | 1 +
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 33a34e4ccc8..95efdb327c1 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -144,8 +144,8 @@ libglsl_la_SOURCES =					\
 	glsl_parser.h					\
 	$(LIBGLSL_FILES)				\
 	$(NIR_FILES)					\
-	$(NIR_GENERATED_FILES)
-
+	$(NIR_GENERATED_FILES)				\
+	$(GLSL_TO_NIR_FILES)
 
 libnir_la_SOURCES =					\
 	$(NIR_FILES)					\
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 4da9b072892..fd28f5cedda 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -182,8 +182,6 @@ LIBGLSL_FILES = \
 	lower_output_reads.cpp \
 	lower_shared_reference.cpp \
 	lower_ubo_reference.cpp \
-	nir/glsl_to_nir.cpp \
-	nir/glsl_to_nir.h \
 	opt_algebraic.cpp \
 	opt_array_splitting.cpp \
 	opt_conditional_discard.cpp \
@@ -213,6 +211,11 @@ LIBGLSL_FILES = \
 	s_expression.cpp \
 	s_expression.h
 
+# glsl to nir pass
+GLSL_TO_NIR_FILES = \
+	nir/glsl_to_nir.cpp \
+	nir/glsl_to_nir.h
+
 # glsl_compiler
 
 GLSL_COMPILER_CXX_FILES = \
diff --git a/src/glsl/SConscript b/src/glsl/SConscript
index 70bf5b09c3c..a9d38c163b7 100644
--- a/src/glsl/SConscript
+++ b/src/glsl/SConscript
@@ -65,6 +65,7 @@ for l in ('LIBGLCPP_FILES', 'LIBGLSL_FILES'):
 # XXX: Remove this once we build NIR and NIR_FILES.
 glsl_sources += [
     'nir/glsl_types.cpp',
+    'nir/shader_enums.c',
 ]
 
 if env['msvc']:

From b3ca26cded7f0930a292d66d16e457a940da0c2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Jan 2016 15:47:01 -0500
Subject: [PATCH 238/241] mesa: merge bind_xfb_buffers_{base|range}

Reduced code duplication should make the code more maintainable.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/bufferobj.c | 172 +++++++++++++-------------------------
 1 file changed, 59 insertions(+), 113 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index d7c5680661b..aaca13c533d 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3537,84 +3537,19 @@ unbind_xfb_buffers(struct gl_context *ctx,
 }
 
 static void
-bind_xfb_buffers_base(struct gl_context *ctx,
-                      GLuint first, GLsizei count,
-                      const GLuint *buffers)
-{
-   struct gl_transform_feedback_object *tfObj =
-      ctx->TransformFeedback.CurrentObject;
-   GLint i;
-
-   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count,
-                                     "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewTransformFeedback;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_xfb_buffers(ctx, tfObj, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_buffer_object * const boundBufObj = tfObj->Buffers[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (boundBufObj && boundBufObj->Name == buffers[i])
-         bufObj = boundBufObj;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj)
-         _mesa_set_transform_feedback_binding(ctx, tfObj, first + i,
-                                              bufObj, 0, 0);
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_xfb_buffers_range(struct gl_context *ctx,
-                       GLuint first, GLsizei count,
-                       const GLuint *buffers,
-                       const GLintptr *offsets,
-                       const GLsizeiptr *sizes)
+bind_xfb_buffers(struct gl_context *ctx,
+                 GLuint first, GLsizei count,
+                 const GLuint *buffers,
+                 bool range,
+                 const GLintptr *offsets,
+                 const GLsizeiptr *sizes,
+                 const char *caller)
 {
    struct gl_transform_feedback_object *tfObj =
        ctx->TransformFeedback.CurrentObject;
    GLint i;
 
-   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count,
-                                     "glBindBuffersRange"))
+   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3659,55 +3594,64 @@ bind_xfb_buffers_range(struct gl_context *ctx,
       const GLuint index = first + i;
       struct gl_buffer_object * const boundBufObj = tfObj->Buffers[index];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         offset = offsets[i];
+         size = sizes[i];
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Transform feedback array bindings (see sec. 13.2.2)           │
-       *      ├───────────────────────┬───────────────────────────────────────┤
-       *      │    ...                │    ...                                │
-       *      │    offset restriction │    multiple of 4                      │
-       *      │    ...                │    ...                                │
-       *      │    size restriction   │    multiple of 4                      │
-       *      └───────────────────────┴───────────────────────────────────────┘"
-       */
-      if (offsets[i] & 0x3) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of 4 when "
-                     "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
-                     i, (int64_t) offsets[i]);
-         continue;
-      }
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      if (sizes[i] & 0x3) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(sizes[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of 4 when "
-                     "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
-                     i, (int64_t) sizes[i]);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Transform feedback array bindings (see sec. 13.2.2)           │
+          *      ├───────────────────────┬───────────────────────────────────────┤
+          *      │    ...                │    ...                                │
+          *      │    offset restriction │    multiple of 4                      │
+          *      │    ...                │    ...                                │
+          *      │    size restriction   │    multiple of 4                      │
+          *      └───────────────────────┴───────────────────────────────────────┘"
+          */
+         if (offsets[i] & 0x3) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of 4 when "
+                        "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
+                        i, (int64_t) offsets[i]);
+            continue;
+         }
+
+         if (sizes[i] & 0x3) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(sizes[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of 4 when "
+                        "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
+                        i, (int64_t) sizes[i]);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (boundBufObj && boundBufObj->Name == buffers[i])
          bufObj = boundBufObj;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj)
          _mesa_set_transform_feedback_binding(ctx, tfObj, index, bufObj,
-                                              offsets[i], sizes[i]);
+                                              offset, size);
    }
 
    _mesa_end_bufferobj_lookups(ctx);
@@ -4068,7 +4012,8 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
 
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
-      bind_xfb_buffers_range(ctx, first, count, buffers, offsets, sizes);
+      bind_xfb_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                       "glBindBuffersRange");
       return;
    case GL_UNIFORM_BUFFER:
       bind_uniform_buffers_range(ctx, first, count, buffers, offsets, sizes);
@@ -4101,7 +4046,8 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
 
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
-      bind_xfb_buffers_base(ctx, first, count, buffers);
+      bind_xfb_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                       "glBindBuffersBase");
       return;
    case GL_UNIFORM_BUFFER:
       bind_uniform_buffers_base(ctx, first, count, buffers);

From e8dd7cc303b03b3d7df91b12d667f82b06a910f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Jan 2016 17:20:57 -0500
Subject: [PATCH 239/241] mesa: merge bind_uniform_buffers_{base|range}

Reduced code duplication should make the code more maintainable.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/bufferobj.c | 158 ++++++++++++--------------------------
 1 file changed, 50 insertions(+), 108 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index aaca13c533d..9a7e41301c5 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3128,72 +3128,6 @@ unbind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
                        bufObj, -1, -1, GL_TRUE);
 }
 
-static void
-bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count,
-                          const GLuint *buffers)
-{
-   GLint i;
-
-   if (!error_check_bind_uniform_buffers(ctx, first, count, "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_uniform_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_uniform_buffer_binding *binding =
-          &ctx->UniformBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj) {
-         if (bufObj == ctx->Shared->NullBufferObj)
-            set_ubo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
-         else
-            set_ubo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
-      }
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
 static void
 bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first,
                                  GLsizei count, const GLuint *buffers)
@@ -3262,14 +3196,15 @@ bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first,
 }
 
 static void
-bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
-                           const GLuint *buffers,
-                           const GLintptr *offsets, const GLsizeiptr *sizes)
+bind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count,
+                     const GLuint *buffers,
+                     bool range,
+                     const GLintptr *offsets, const GLsizeiptr *sizes,
+                     const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_uniform_buffers(ctx, first, count,
-                                         "glBindBuffersRange"))
+   if (!error_check_bind_uniform_buffers(ctx, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3314,52 +3249,57 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
       struct gl_uniform_buffer_binding *binding =
          &ctx->UniformBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Uniform buffer array bindings (see sec. 7.6)                  │
-       *      ├─────────────────────┬─────────────────────────────────────────┤
-       *      │  ...                │  ...                                    │
-       *      │  offset restriction │  multiple of value of UNIFORM_BUFFER_-  │
-       *      │                     │  OFFSET_ALIGNMENT                       │
-       *      │  ...                │  ...                                    │
-       *      │  size restriction   │  none                                   │
-       *      └─────────────────────┴─────────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ctx->Const.UniformBufferOffsetAlignment - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of the value of "
-                     "GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT=%u when "
-                     "target=GL_UNIFORM_BUFFER)",
-                     i, (int64_t) offsets[i],
-                     ctx->Const.UniformBufferOffsetAlignment);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Uniform buffer array bindings (see sec. 7.6)                  │
+          *      ├─────────────────────┬─────────────────────────────────────────┤
+          *      │  ...                │  ...                                    │
+          *      │  offset restriction │  multiple of value of UNIFORM_BUFFER_-  │
+          *      │                     │  OFFSET_ALIGNMENT                       │
+          *      │  ...                │  ...                                    │
+          *      │  size restriction   │  none                                   │
+          *      └─────────────────────┴─────────────────────────────────────────┘"
+          */
+         if (offsets[i] & (ctx->Const.UniformBufferOffsetAlignment - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of the value of "
+                        "GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT=%u when "
+                        "target=GL_UNIFORM_BUFFER)",
+                        i, (int64_t) offsets[i],
+                        ctx->Const.UniformBufferOffsetAlignment);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj) {
          if (bufObj == ctx->Shared->NullBufferObj)
-            set_ubo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+            set_ubo_binding(ctx, binding, bufObj, -1, -1, !range);
          else
-            set_ubo_binding(ctx, binding, bufObj,
-                            offsets[i], sizes[i], GL_FALSE);
+            set_ubo_binding(ctx, binding, bufObj, offset, size, !range);
       }
    }
 
@@ -4016,7 +3956,8 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
                        "glBindBuffersRange");
       return;
    case GL_UNIFORM_BUFFER:
-      bind_uniform_buffers_range(ctx, first, count, buffers, offsets, sizes);
+      bind_uniform_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                           "glBindBuffersRange");
       return;
    case GL_SHADER_STORAGE_BUFFER:
       bind_shader_storage_buffers_range(ctx, first, count, buffers, offsets,
@@ -4050,7 +3991,8 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
                        "glBindBuffersBase");
       return;
    case GL_UNIFORM_BUFFER:
-      bind_uniform_buffers_base(ctx, first, count, buffers);
+      bind_uniform_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                           "glBindBuffersBase");
       return;
    case GL_SHADER_STORAGE_BUFFER:
       bind_shader_storage_buffers_base(ctx, first, count, buffers);

From 5eb104d6abc8a500e028de3689ea2d82bf5f97de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Jan 2016 17:26:14 -0500
Subject: [PATCH 240/241] mesa: merge bind_shader_storage_buffers_{base|range}

Reduced code duplication should make the code more maintainable.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/bufferobj.c | 162 ++++++++++++--------------------------
 1 file changed, 51 insertions(+), 111 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 9a7e41301c5..00680f5a8e6 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3128,73 +3128,6 @@ unbind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
                        bufObj, -1, -1, GL_TRUE);
 }
 
-static void
-bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first,
-                                 GLsizei count, const GLuint *buffers)
-{
-   GLint i;
-
-   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
-                                                "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_shader_storage_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_shader_storage_buffer_binding *binding =
-          &ctx->ShaderStorageBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj) {
-         if (bufObj == ctx->Shared->NullBufferObj)
-            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
-         else
-            set_ssbo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
-      }
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
 static void
 bind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count,
                      const GLuint *buffers,
@@ -3307,15 +3240,16 @@ bind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count,
 }
 
 static void
-bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
-                                  GLsizei count, const GLuint *buffers,
-                                  const GLintptr *offsets,
-                                  const GLsizeiptr *sizes)
+bind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
+                            GLsizei count, const GLuint *buffers,
+                            bool range,
+                            const GLintptr *offsets,
+                            const GLsizeiptr *sizes,
+                            const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
-                                                "glBindBuffersRange"))
+   if (!error_check_bind_shader_storage_buffers(ctx, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3360,52 +3294,57 @@ bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
       struct gl_shader_storage_buffer_binding *binding =
          &ctx->ShaderStorageBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Shader storage buffer array bindings (see sec. 7.8)           │
-       *      ├─────────────────────┬─────────────────────────────────────────┤
-       *      │  ...                │  ...                                    │
-       *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
-       *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
-       *      │  ...                │  ...                                    │
-       *      │  size restriction   │  none                                   │
-       *      └─────────────────────┴─────────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of the value of "
-                     "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
-                     "target=GL_SHADER_STORAGE_BUFFER)",
-                     i, (int64_t) offsets[i],
-                     ctx->Const.ShaderStorageBufferOffsetAlignment);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Shader storage buffer array bindings (see sec. 7.8)           │
+          *      ├─────────────────────┬─────────────────────────────────────────┤
+          *      │  ...                │  ...                                    │
+          *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
+          *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
+          *      │  ...                │  ...                                    │
+          *      │  size restriction   │  none                                   │
+          *      └─────────────────────┴─────────────────────────────────────────┘"
+          */
+         if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of the value of "
+                        "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
+                        "target=GL_SHADER_STORAGE_BUFFER)",
+                        i, (int64_t) offsets[i],
+                        ctx->Const.ShaderStorageBufferOffsetAlignment);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj) {
          if (bufObj == ctx->Shared->NullBufferObj)
-            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+            set_ssbo_binding(ctx, binding, bufObj, -1, -1, !range);
          else
-            set_ssbo_binding(ctx, binding, bufObj,
-                             offsets[i], sizes[i], GL_FALSE);
+            set_ssbo_binding(ctx, binding, bufObj, offset, size, !range);
       }
    }
 
@@ -3960,8 +3899,8 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
                            "glBindBuffersRange");
       return;
    case GL_SHADER_STORAGE_BUFFER:
-      bind_shader_storage_buffers_range(ctx, first, count, buffers, offsets,
-                                        sizes);
+      bind_shader_storage_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                                  "glBindBuffersRange");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffers_range(ctx, first, count, buffers,
@@ -3995,7 +3934,8 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
                            "glBindBuffersBase");
       return;
    case GL_SHADER_STORAGE_BUFFER:
-      bind_shader_storage_buffers_base(ctx, first, count, buffers);
+      bind_shader_storage_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                                  "glBindBuffersBase");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffers_base(ctx, first, count, buffers);

From da5d4583e53fc9cdc86aba7d2ac770e01baa158d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Jan 2016 17:30:18 -0500
Subject: [PATCH 241/241] mesa: merge bind_atomic_buffers_{base|range}

Reduced code duplication should make the code more maintainable.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/bufferobj.c | 155 ++++++++++++--------------------------
 1 file changed, 49 insertions(+), 106 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 00680f5a8e6..14ee8c8fc73 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -3580,82 +3580,18 @@ unbind_atomic_buffers(struct gl_context *ctx, GLuint first, GLsizei count)
 }
 
 static void
-bind_atomic_buffers_base(struct gl_context *ctx,
-                         GLuint first,
-                         GLsizei count,
-                         const GLuint *buffers)
+bind_atomic_buffers(struct gl_context *ctx,
+                    GLuint first,
+                    GLsizei count,
+                    const GLuint *buffers,
+                    bool range,
+                    const GLintptr *offsets,
+                    const GLsizeiptr *sizes,
+                    const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_atomic_buffers(ctx, first, count,
-                                        "glBindBuffersBase"))
-     return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewAtomicBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_atomic_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_atomic_buffer_binding *binding =
-         &ctx->AtomicBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj)
-         set_atomic_buffer_binding(ctx, binding, bufObj, 0, 0);
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_atomic_buffers_range(struct gl_context *ctx,
-                          GLuint first,
-                          GLsizei count,
-                          const GLuint *buffers,
-                          const GLintptr *offsets,
-                          const GLsizeiptr *sizes)
-{
-   GLint i;
-
-   if (!error_check_bind_atomic_buffers(ctx, first, count,
-                                        "glBindBuffersRange"))
+   if (!error_check_bind_atomic_buffers(ctx, first, count, caller))
      return;
 
    /* Assume that at least one binding will be changed */
@@ -3700,45 +3636,51 @@ bind_atomic_buffers_range(struct gl_context *ctx,
       struct gl_atomic_buffer_binding *binding =
          &ctx->AtomicBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Atomic counter array bindings (see sec. 7.7.2)                │
-       *      ├───────────────────────┬───────────────────────────────────────┤
-       *      │    ...                │    ...                                │
-       *      │    offset restriction │    multiple of 4                      │
-       *      │    ...                │    ...                                │
-       *      │    size restriction   │    none                               │
-       *      └───────────────────────┴───────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ATOMIC_COUNTER_SIZE - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of %d when "
-                     "target=GL_ATOMIC_COUNTER_BUFFER)",
-                     i, (int64_t) offsets[i], ATOMIC_COUNTER_SIZE);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Atomic counter array bindings (see sec. 7.7.2)                │
+          *      ├───────────────────────┬───────────────────────────────────────┤
+          *      │    ...                │    ...                                │
+          *      │    offset restriction │    multiple of 4                      │
+          *      │    ...                │    ...                                │
+          *      │    size restriction   │    none                               │
+          *      └───────────────────────┴───────────────────────────────────────┘"
+          */
+         if (offsets[i] & (ATOMIC_COUNTER_SIZE - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of %d when "
+                        "target=GL_ATOMIC_COUNTER_BUFFER)",
+                        i, (int64_t) offsets[i], ATOMIC_COUNTER_SIZE);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj)
-         set_atomic_buffer_binding(ctx, binding, bufObj, offsets[i], sizes[i]);
+         set_atomic_buffer_binding(ctx, binding, bufObj, offset, size);
    }
 
    _mesa_end_bufferobj_lookups(ctx);
@@ -3903,8 +3845,8 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
                                   "glBindBuffersRange");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffers_range(ctx, first, count, buffers,
-                                offsets, sizes);
+      bind_atomic_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                          "glBindBuffersRange");
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersRange(target=%s)",
@@ -3938,7 +3880,8 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
                                   "glBindBuffersBase");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffers_base(ctx, first, count, buffers);
+      bind_atomic_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                          "glBindBuffersBase");
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersBase(target=%s)",