From 5509d43a11d42f15c91572aaf69a0f8e1ec31c71 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 4 Apr 2016 00:31:45 -0700
Subject: [PATCH 01/72] glsl: Lower variable indexing of system value arrays
 unconditionally.

lower_variable_index_to_cond_assign() did not handle system values.
gl_SampleMaskIn[] is a system value, and also an array.  Accessing it
with a variable index would trigger an unreachable() assert.

Rather than adding a new EmitNoIndirectSystemValues flag, we simply
lower unconditionally.  There is exactly one case where this occurs,
and for all current drivers, lowering produces optimal code.  Even
for future drivers with 32x MSAA, it produces reasonable code.

Fixes Piglit's new samplemaskin-indirect test.  Also fixes many ES31-CTS
tests when OES_sample_variables is enabled.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 .../lower_variable_index_to_cond_assign.cpp   | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/compiler/glsl/lower_variable_index_to_cond_assign.cpp b/src/compiler/glsl/lower_variable_index_to_cond_assign.cpp
index 278d5450bfb..fcb12d1b77d 100644
--- a/src/compiler/glsl/lower_variable_index_to_cond_assign.cpp
+++ b/src/compiler/glsl/lower_variable_index_to_cond_assign.cpp
@@ -385,6 +385,26 @@ public:
       case ir_var_const_in:
          return this->lower_temps;
 
+      case ir_var_system_value:
+         /* There are only a few system values that have array types:
+          *
+          *    gl_TessLevelInner[]
+          *    gl_TessLevelOuter[]
+          *    gl_SampleMaskIn[]
+          *
+          * The tessellation factor arrays are lowered to vec4/vec2s
+          * by lower_tess_level() before this pass occurs, so we'll
+          * never see them here.
+          *
+          * The only remaining case is gl_SampleMaskIn[], which has
+          * a length of ceil(ctx->Const.MaxSamples / 32).  Most hardware
+          * supports no more than 32 samples, in which case our lowering
+          * produces a single read of gl_SampleMaskIn[0].  Even with 64x
+          * MSAA, the array length is only 2, so the lowering is fairly
+          * efficient.  Therefore, lower unconditionally.
+          */
+         return true;
+
       case ir_var_shader_in:
          /* The input array size is unknown at compiler time for non-patch
           * inputs in TCS and TES. The arrays are sized to

From 9486614938035f6bec746d207a9cf79a7def0724 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 30 Mar 2016 12:00:02 -0700
Subject: [PATCH 02/72] i965: Make bblock_t::next and friends return NULL at
 sentinels.

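The bblock_t::prev/prev_const/next/next_const API returns bblock_t
pointers, rather than exec_nodes.  So it's a bit surprising when they
hand back a list sentinel cast to a bblock_t.  Make them return NULL at
the head/tail sentinels instead.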

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_cfg.h      | 12 ++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index 405020b77e5..5b770aa7af1 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -121,24 +121,36 @@ bblock_end_const(const struct bblock_t *block)
 static inline struct bblock_t *
 bblock_next(struct bblock_t *block)
 {
+   if (exec_node_is_tail_sentinel(block->link.next))
+      return NULL;
+
    return (struct bblock_t *)block->link.next;
 }
 
 static inline const struct bblock_t *
 bblock_next_const(const struct bblock_t *block)
 {
+   if (exec_node_is_tail_sentinel(block->link.next))
+      return NULL;
+
    return (const struct bblock_t *)block->link.next;
 }
 
 static inline struct bblock_t *
 bblock_prev(struct bblock_t *block)
 {
+   if (exec_node_is_head_sentinel(block->link.prev))
+      return NULL;
+
    return (struct bblock_t *)block->link.prev;
 }
 
 static inline const struct bblock_t *
 bblock_prev_const(const struct bblock_t *block)
 {
+   if (exec_node_is_head_sentinel(block->link.prev))
+      return NULL;
+
    return (const struct bblock_t *)block->link.prev;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 736deb443dd..376cb258232 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -938,7 +938,7 @@ static void
 adjust_later_block_ips(bblock_t *start_block, int ip_adjustment)
 {
    for (bblock_t *block_iter = start_block->next();
-        !block_iter->link.is_tail_sentinel();
+        block_iter;
         block_iter = block_iter->next()) {
       block_iter->start_ip += ip_adjustment;
       block_iter->end_ip += ip_adjustment;

From da5d08707bf07c76b6a1851f3a36bb7c1f8d4d4b Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 30 Mar 2016 12:00:02 -0700
Subject: [PATCH 03/72] i965: Fix invalid pointer read in
 dead_control_flow_eliminate().

There may not be a previous block.  In this case, there's no real work
to do, so just continue on to the next one.

v2: Update for bblock->prev() API change.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
index 2c1abaf255c..114dc6cb212 100644
--- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
+++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
@@ -42,6 +42,10 @@ dead_control_flow_eliminate(backend_shader *s)
 
    foreach_block_safe (block, s->cfg) {
       bblock_t *prev_block = block->prev();
+
+      if (!prev_block)
+         continue;
+
       backend_instruction *const inst = block->start();
       backend_instruction *const prev_inst = prev_block->end();
 

From 3babb7b0a4037c4ae98078611a827af6dd3e121e Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 3 Apr 2016 19:51:22 -0700
Subject: [PATCH 04/72] nir: Use PRIi64 and PRIu64 instead of %ld and %lu.

%ld and %lu aren't the right format specifiers for int64_t and uint64_t
on 32-bit (x86) systems.  There, the right ones are %lld and %llu on
Linux and %I64d and %I64u on Windows.

Use the standard C99 macros in hopes that they work everywhere.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/compiler/nir/nir_search.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index b915101ce32..3a65ab18928 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -25,6 +25,7 @@
  *
  */
 
+#include <inttypes.h>
 #include "nir_search.h"
 
 struct match_state {
@@ -494,7 +495,7 @@ construct_value(const nir_search_value *value,
          break;
 
       case nir_type_int:
-         load->def.name = ralloc_asprintf(load, "%ld", c->data.i);
+         load->def.name = ralloc_asprintf(load, "%" PRIi64, c->data.i);
          switch (bitsize->dest_size) {
          case 32:
             load->value.i32[0] = c->data.i;
@@ -508,7 +509,7 @@ construct_value(const nir_search_value *value,
          break;
 
       case nir_type_uint:
-         load->def.name = ralloc_asprintf(load, "%lu", c->data.u);
+         load->def.name = ralloc_asprintf(load, "%" PRIu64, c->data.u);
          switch (bitsize->dest_size) {
          case 32:
             load->value.u32[0] = c->data.u;

From 80c72a8ea7b1018661da0e6509a7f88ca1f5086f Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 14:02:50 -0700
Subject: [PATCH 05/72] i965/nir: Provide a default LOD for buffer textures

Our hardware requires an LOD for all texelFetch commands even if they are
on buffer textures.  GLSL IR gives us an LOD of 0 in that case, but the LOD
is really rather meaningless.  This commit allows other NIR producers to be
more lazy and not provide one at all.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 4 ++++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 5cca91ec5b4..b804f3c953f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -3022,6 +3022,10 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 
    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
 
+   /* The hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      lod = brw_imm_d(0);
+
    for (unsigned i = 0; i < instr->num_srcs; i++) {
       fs_reg src = get_nir_src(instr->src[i].src);
       switch (instr->src[i].src_type) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 6c8fd06fb5e..d9f96c58379 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1700,6 +1700,10 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
                                  nir_tex_instr_dest_size(instr));
    dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
 
+   /* The hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      lod = brw_imm_d(0);
+
    /* Load the texture operation sources */
    uint32_t constant_offset = 0;
    for (unsigned i = 0; i < instr->num_srcs; i++) {

From 5ea3647f89abccea5496824815b5b729f38f7a23 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 11:19:53 -0700
Subject: [PATCH 06/72] i965/fs: Move the code for load/store_shared to
 emit_cs_intrinsic

They are compute-shader only, and that's where the code for doing atomics
on shared variables lives, so it seems to make sense.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 152 +++++++++++------------
 1 file changed, 76 insertions(+), 76 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index b804f3c953f..90b878913b3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -2368,6 +2368,82 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
       break;
 
+   case nir_intrinsic_load_shared: {
+      assert(devinfo->gen >= 7);
+
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Get the offset to read from */
+      fs_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
+      } else {
+         offset_reg = vgrf(glsl_type::uint_type);
+         bld.ADD(offset_reg,
+                 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(instr->const_index[0]));
+      }
+
+      /* Read the vector */
+      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                             1 /* dims */,
+                                             instr->num_components,
+                                             BRW_PREDICATE_NONE);
+      read_result.type = dest.type;
+      for (int i = 0; i < instr->num_components; i++)
+         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
+
+      break;
+   }
+
+   case nir_intrinsic_store_shared: {
+      assert(devinfo->gen >= 7);
+
+      /* Block index */
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Value */
+      fs_reg val_reg = get_nir_src(instr->src[0]);
+
+      /* Writemask */
+      unsigned writemask = instr->const_index[1];
+
+      /* Combine groups of consecutive enabled channels in one write
+       * message. We use ffs to find the first enabled channel and then ffs on
+       * the bit-inverse, down-shifted writemask to determine the length of
+       * the block of enabled bits.
+       */
+      while (writemask) {
+         unsigned first_component = ffs(writemask) - 1;
+         unsigned length = ffs(~(writemask >> first_component)) - 1;
+         fs_reg offset_reg;
+
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+         if (const_offset) {
+            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
+                                    4 * first_component);
+         } else {
+            offset_reg = vgrf(glsl_type::uint_type);
+            bld.ADD(offset_reg,
+                    retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
+                    brw_imm_ud(instr->const_index[0] + 4 * first_component));
+         }
+
+         emit_untyped_write(bld, surf_index, offset_reg,
+                            offset(val_reg, bld, first_component),
+                            1 /* dims */, length,
+                            BRW_PREDICATE_NONE);
+
+         /* Clear the bits in the writemask that we just wrote, then try
+          * again to see if more channels are left.
+          */
+         writemask &= (15 << (first_component + length));
+      }
+
+      break;
+   }
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
@@ -2691,82 +2767,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_shared: {
-      assert(devinfo->gen >= 7);
-
-      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
-      /* Get the offset to read from */
-      fs_reg offset_reg;
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-      if (const_offset) {
-         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
-      } else {
-         offset_reg = vgrf(glsl_type::uint_type);
-         bld.ADD(offset_reg,
-                 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
-                 brw_imm_ud(instr->const_index[0]));
-      }
-
-      /* Read the vector */
-      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
-                                             1 /* dims */,
-                                             instr->num_components,
-                                             BRW_PREDICATE_NONE);
-      read_result.type = dest.type;
-      for (int i = 0; i < instr->num_components; i++)
-         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
-
-      break;
-   }
-
-   case nir_intrinsic_store_shared: {
-      assert(devinfo->gen >= 7);
-
-      /* Block index */
-      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
-      /* Value */
-      fs_reg val_reg = get_nir_src(instr->src[0]);
-
-      /* Writemask */
-      unsigned writemask = instr->const_index[1];
-
-      /* Combine groups of consecutive enabled channels in one write
-       * message. We use ffs to find the first enabled channel and then ffs on
-       * the bit-inverse, down-shifted writemask to determine the length of
-       * the block of enabled bits.
-       */
-      while (writemask) {
-         unsigned first_component = ffs(writemask) - 1;
-         unsigned length = ffs(~(writemask >> first_component)) - 1;
-         fs_reg offset_reg;
-
-         nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
-         if (const_offset) {
-            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
-                                    4 * first_component);
-         } else {
-            offset_reg = vgrf(glsl_type::uint_type);
-            bld.ADD(offset_reg,
-                    retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
-                    brw_imm_ud(instr->const_index[0] + 4 * first_component));
-         }
-
-         emit_untyped_write(bld, surf_index, offset_reg,
-                            offset(val_reg, bld, first_component),
-                            1 /* dims */, length,
-                            BRW_PREDICATE_NONE);
-
-         /* Clear the bits in the writemask that we just wrote, then try
-          * again to see if more channels are left.
-          */
-         writemask &= (15 << (first_component + length));
-      }
-
-      break;
-   }
-
    case nir_intrinsic_load_input: {
       fs_reg src;
       if (stage == MESA_SHADER_VERTEX) {

From 70735643f4cf660dc3022f40f853a138aea738c2 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 27 Mar 2016 11:40:36 -0400
Subject: [PATCH 07/72] freedreno/ir3: encode instruction category in opc_t

Been on my TODO list for a while.  If nothing else this will make gdb
properly grok the opc_t enum.

This first step preserves ir3_instruction::category (with an added
assert that category matches what is encoded in opc_t).  Next step is
to drop the category field (and arg to ir3_instr_create()), but that
is split into next commit for bisectability and so that we can run
piglit in the intermediate state to flush out any problems.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/ir3/disasm-a3xx.c       |  78 ++---
 .../drivers/freedreno/ir3/instr-a3xx.h        | 298 +++++++++---------
 src/gallium/drivers/freedreno/ir3/ir3.c       |   1 +
 src/gallium/drivers/freedreno/ir3/ir3.h       |   4 +-
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  |  12 +-
 5 files changed, 201 insertions(+), 192 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 599872470fc..e29d1568256 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -243,7 +243,7 @@ static void print_instr_cat2(instr_t *instr)
 			"?6?",
 	};
 
-	switch (cat2->opc) {
+	switch (_OPC(2, cat2->opc)) {
 	case OPC_CMPS_F:
 	case OPC_CMPS_U:
 	case OPC_CMPS_S:
@@ -274,7 +274,7 @@ static void print_instr_cat2(instr_t *instr)
 				cat2->src1_abs, false);
 	}
 
-	switch (cat2->opc) {
+	switch (_OPC(2, cat2->opc)) {
 	case OPC_ABSNEG_F:
 	case OPC_ABSNEG_S:
 	case OPC_CLZ_B:
@@ -382,34 +382,34 @@ static void print_instr_cat5(instr_t *instr)
 	static const struct {
 		bool src1, src2, samp, tex;
 	} info[0x1f] = {
-			[OPC_ISAM]     = { true,  false, true,  true,  },
-			[OPC_ISAML]    = { true,  true,  true,  true,  },
-			[OPC_ISAMM]    = { true,  false, true,  true,  },
-			[OPC_SAM]      = { true,  false, true,  true,  },
-			[OPC_SAMB]     = { true,  true,  true,  true,  },
-			[OPC_SAML]     = { true,  true,  true,  true,  },
-			[OPC_SAMGQ]    = { true,  false, true,  true,  },
-			[OPC_GETLOD]   = { true,  false, true,  true,  },
-			[OPC_CONV]     = { true,  true,  true,  true,  },
-			[OPC_CONVM]    = { true,  true,  true,  true,  },
-			[OPC_GETSIZE]  = { true,  false, false, true,  },
-			[OPC_GETBUF]   = { false, false, false, true,  },
-			[OPC_GETPOS]   = { true,  false, false, true,  },
-			[OPC_GETINFO]  = { false, false, false, true,  },
-			[OPC_DSX]      = { true,  false, false, false, },
-			[OPC_DSY]      = { true,  false, false, false, },
-			[OPC_GATHER4R] = { true,  false, true,  true,  },
-			[OPC_GATHER4G] = { true,  false, true,  true,  },
-			[OPC_GATHER4B] = { true,  false, true,  true,  },
-			[OPC_GATHER4A] = { true,  false, true,  true,  },
-			[OPC_SAMGP0]   = { true,  false, true,  true,  },
-			[OPC_SAMGP1]   = { true,  false, true,  true,  },
-			[OPC_SAMGP2]   = { true,  false, true,  true,  },
-			[OPC_SAMGP3]   = { true,  false, true,  true,  },
-			[OPC_DSXPP_1]  = { true,  false, false, false, },
-			[OPC_DSYPP_1]  = { true,  false, false, false, },
-			[OPC_RGETPOS]  = { false, false, false, false, },
-			[OPC_RGETINFO] = { false, false, false, false, },
+			[opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
+			[opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
+			[opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
+			[opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
+			[opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
+			[opc_op(OPC_DSX)]      = { true,  false, false, false, },
+			[opc_op(OPC_DSY)]      = { true,  false, false, false, },
+			[opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_RGETPOS)]  = { false, false, false, false, },
+			[opc_op(OPC_RGETINFO)] = { false, false, false, false, },
 	};
 	instr_cat5_t *cat5 = &instr->cat5;
 	int i;
@@ -423,7 +423,7 @@ static void print_instr_cat5(instr_t *instr)
 
 	printf(" ");
 
-	switch (cat5->opc) {
+	switch (_OPC(5, cat5->opc)) {
 	case OPC_DSXPP_1:
 	case OPC_DSYPP_1:
 		break;
@@ -488,7 +488,7 @@ static void print_instr_cat6(instr_t *instr)
 	memset(&src1, 0, sizeof(src1));
 	memset(&src2, 0, sizeof(src2));
 
-	switch (cat6->opc) {
+	switch (_OPC(6, cat6->opc)) {
 	case OPC_RESINFO:
 	case OPC_RESFMT:
 		dst.full  = type_size(cat6->type) == 32;
@@ -519,7 +519,7 @@ static void print_instr_cat6(instr_t *instr)
 		break;
 	}
 
-	switch (cat6->opc) {
+	switch (_OPC(6, cat6->opc)) {
 	case OPC_PREFETCH:
 	case OPC_RESINFO:
 		break;
@@ -545,7 +545,7 @@ static void print_instr_cat6(instr_t *instr)
 	}
 	printf(" ");
 
-	switch (cat6->opc) {
+	switch (_OPC(6, cat6->opc)) {
 	case OPC_STG:
 		sd = 'g';
 		break;
@@ -636,7 +636,7 @@ static void print_instr_cat6(instr_t *instr)
 	if (ss)
 		printf("]");
 
-	switch (cat6->opc) {
+	switch (_OPC(6, cat6->opc)) {
 	case OPC_RESINFO:
 	case OPC_RESFMT:
 		break;
@@ -656,7 +656,7 @@ static const struct opc_info {
 	const char *name;
 	void (*print)(instr_t *instr);
 } opcs[1 << (3+NOPC_BITS)] = {
-#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
 	/* category 0: */
 	OPC(0, OPC_NOP,          nop),
 	OPC(0, OPC_BR,           br),
@@ -672,7 +672,7 @@ static const struct opc_info {
 	OPC(0, OPC_FLOW_REV,     flow_rev),
 
 	/* category 1: */
-	OPC(1, 0, ),
+	OPC(1, OPC_MOV, ),
 
 	/* category 2: */
 	OPC(2, OPC_ADD_F,        add.f),
@@ -822,8 +822,8 @@ static const struct opc_info {
 #include "ir3.h"
 const char *ir3_instr_name(struct ir3_instruction *instr)
 {
-	if (instr->category == -1) return "??meta??";
-	return opcs[(instr->category << NOPC_BITS) | instr->opc].name;
+	if (opc_cat(instr->opc) == -1) return "??meta??";
+	return opcs[instr->opc].name;
 }
 
 static void print_instr(uint32_t *dwords, int level, int n)
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index 1b1f1f0a797..87083fd1e81 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -29,181 +29,189 @@
 #include <stdint.h>
 #include <assert.h>
 
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | opc)
+
 typedef enum {
 	/* category 0: */
-	OPC_NOP = 0,
-	OPC_BR = 1,
-	OPC_JUMP = 2,
-	OPC_CALL = 3,
-	OPC_RET = 4,
-	OPC_KILL = 5,
-	OPC_END = 6,
-	OPC_EMIT = 7,
-	OPC_CUT = 8,
-	OPC_CHMASK = 9,
-	OPC_CHSH = 10,
-	OPC_FLOW_REV = 11,
+	OPC_NOP             = _OPC(0, 0),
+	OPC_BR              = _OPC(0, 1),
+	OPC_JUMP            = _OPC(0, 2),
+	OPC_CALL            = _OPC(0, 3),
+	OPC_RET             = _OPC(0, 4),
+	OPC_KILL            = _OPC(0, 5),
+	OPC_END             = _OPC(0, 6),
+	OPC_EMIT            = _OPC(0, 7),
+	OPC_CUT             = _OPC(0, 8),
+	OPC_CHMASK          = _OPC(0, 9),
+	OPC_CHSH            = _OPC(0, 10),
+	OPC_FLOW_REV        = _OPC(0, 11),
 
 	/* category 1: */
-	/* no opc.. all category 1 are variants of mov */
+	OPC_MOV             = _OPC(1, 0),
 
 	/* category 2: */
-	OPC_ADD_F = 0,
-	OPC_MIN_F = 1,
-	OPC_MAX_F = 2,
-	OPC_MUL_F = 3,
-	OPC_SIGN_F = 4,
-	OPC_CMPS_F = 5,
-	OPC_ABSNEG_F = 6,
-	OPC_CMPV_F = 7,
+	OPC_ADD_F           = _OPC(2, 0),
+	OPC_MIN_F           = _OPC(2, 1),
+	OPC_MAX_F           = _OPC(2, 2),
+	OPC_MUL_F           = _OPC(2, 3),
+	OPC_SIGN_F          = _OPC(2, 4),
+	OPC_CMPS_F          = _OPC(2, 5),
+	OPC_ABSNEG_F        = _OPC(2, 6),
+	OPC_CMPV_F          = _OPC(2, 7),
 	/* 8 - invalid */
-	OPC_FLOOR_F = 9,
-	OPC_CEIL_F = 10,
-	OPC_RNDNE_F = 11,
-	OPC_RNDAZ_F = 12,
-	OPC_TRUNC_F = 13,
+	OPC_FLOOR_F         = _OPC(2, 9),
+	OPC_CEIL_F          = _OPC(2, 10),
+	OPC_RNDNE_F         = _OPC(2, 11),
+	OPC_RNDAZ_F         = _OPC(2, 12),
+	OPC_TRUNC_F         = _OPC(2, 13),
 	/* 14-15 - invalid */
-	OPC_ADD_U = 16,
-	OPC_ADD_S = 17,
-	OPC_SUB_U = 18,
-	OPC_SUB_S = 19,
-	OPC_CMPS_U = 20,
-	OPC_CMPS_S = 21,
-	OPC_MIN_U = 22,
-	OPC_MIN_S = 23,
-	OPC_MAX_U = 24,
-	OPC_MAX_S = 25,
-	OPC_ABSNEG_S = 26,
+	OPC_ADD_U           = _OPC(2, 16),
+	OPC_ADD_S           = _OPC(2, 17),
+	OPC_SUB_U           = _OPC(2, 18),
+	OPC_SUB_S           = _OPC(2, 19),
+	OPC_CMPS_U          = _OPC(2, 20),
+	OPC_CMPS_S          = _OPC(2, 21),
+	OPC_MIN_U           = _OPC(2, 22),
+	OPC_MIN_S           = _OPC(2, 23),
+	OPC_MAX_U           = _OPC(2, 24),
+	OPC_MAX_S           = _OPC(2, 25),
+	OPC_ABSNEG_S        = _OPC(2, 26),
 	/* 27 - invalid */
-	OPC_AND_B = 28,
-	OPC_OR_B = 29,
-	OPC_NOT_B = 30,
-	OPC_XOR_B = 31,
+	OPC_AND_B           = _OPC(2, 28),
+	OPC_OR_B            = _OPC(2, 29),
+	OPC_NOT_B           = _OPC(2, 30),
+	OPC_XOR_B           = _OPC(2, 31),
 	/* 32 - invalid */
-	OPC_CMPV_U = 33,
-	OPC_CMPV_S = 34,
+	OPC_CMPV_U          = _OPC(2, 33),
+	OPC_CMPV_S          = _OPC(2, 34),
 	/* 35-47 - invalid */
-	OPC_MUL_U = 48,
-	OPC_MUL_S = 49,
-	OPC_MULL_U = 50,
-	OPC_BFREV_B = 51,
-	OPC_CLZ_S = 52,
-	OPC_CLZ_B = 53,
-	OPC_SHL_B = 54,
-	OPC_SHR_B = 55,
-	OPC_ASHR_B = 56,
-	OPC_BARY_F = 57,
-	OPC_MGEN_B = 58,
-	OPC_GETBIT_B = 59,
-	OPC_SETRM = 60,
-	OPC_CBITS_B = 61,
-	OPC_SHB = 62,
-	OPC_MSAD = 63,
+	OPC_MUL_U           = _OPC(2, 48),
+	OPC_MUL_S           = _OPC(2, 49),
+	OPC_MULL_U          = _OPC(2, 50),
+	OPC_BFREV_B         = _OPC(2, 51),
+	OPC_CLZ_S           = _OPC(2, 52),
+	OPC_CLZ_B           = _OPC(2, 53),
+	OPC_SHL_B           = _OPC(2, 54),
+	OPC_SHR_B           = _OPC(2, 55),
+	OPC_ASHR_B          = _OPC(2, 56),
+	OPC_BARY_F          = _OPC(2, 57),
+	OPC_MGEN_B          = _OPC(2, 58),
+	OPC_GETBIT_B        = _OPC(2, 59),
+	OPC_SETRM           = _OPC(2, 60),
+	OPC_CBITS_B         = _OPC(2, 61),
+	OPC_SHB             = _OPC(2, 62),
+	OPC_MSAD            = _OPC(2, 63),
 
 	/* category 3: */
-	OPC_MAD_U16 = 0,
-	OPC_MADSH_U16 = 1,
-	OPC_MAD_S16 = 2,
-	OPC_MADSH_M16 = 3,   /* should this be .s16? */
-	OPC_MAD_U24 = 4,
-	OPC_MAD_S24 = 5,
-	OPC_MAD_F16 = 6,
-	OPC_MAD_F32 = 7,
-	OPC_SEL_B16 = 8,
-	OPC_SEL_B32 = 9,
-	OPC_SEL_S16 = 10,
-	OPC_SEL_S32 = 11,
-	OPC_SEL_F16 = 12,
-	OPC_SEL_F32 = 13,
-	OPC_SAD_S16 = 14,
-	OPC_SAD_S32 = 15,
+	OPC_MAD_U16         = _OPC(3, 0),
+	OPC_MADSH_U16       = _OPC(3, 1),
+	OPC_MAD_S16         = _OPC(3, 2),
+	OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
+	OPC_MAD_U24         = _OPC(3, 4),
+	OPC_MAD_S24         = _OPC(3, 5),
+	OPC_MAD_F16         = _OPC(3, 6),
+	OPC_MAD_F32         = _OPC(3, 7),
+	OPC_SEL_B16         = _OPC(3, 8),
+	OPC_SEL_B32         = _OPC(3, 9),
+	OPC_SEL_S16         = _OPC(3, 10),
+	OPC_SEL_S32         = _OPC(3, 11),
+	OPC_SEL_F16         = _OPC(3, 12),
+	OPC_SEL_F32         = _OPC(3, 13),
+	OPC_SAD_S16         = _OPC(3, 14),
+	OPC_SAD_S32         = _OPC(3, 15),
 
 	/* category 4: */
-	OPC_RCP = 0,
-	OPC_RSQ = 1,
-	OPC_LOG2 = 2,
-	OPC_EXP2 = 3,
-	OPC_SIN = 4,
-	OPC_COS = 5,
-	OPC_SQRT = 6,
+	OPC_RCP             = _OPC(4, 0),
+	OPC_RSQ             = _OPC(4, 1),
+	OPC_LOG2            = _OPC(4, 2),
+	OPC_EXP2            = _OPC(4, 3),
+	OPC_SIN             = _OPC(4, 4),
+	OPC_COS             = _OPC(4, 5),
+	OPC_SQRT            = _OPC(4, 6),
 	// 7-63 - invalid
 
 	/* category 5: */
-	OPC_ISAM = 0,
-	OPC_ISAML = 1,
-	OPC_ISAMM = 2,
-	OPC_SAM = 3,
-	OPC_SAMB = 4,
-	OPC_SAML = 5,
-	OPC_SAMGQ = 6,
-	OPC_GETLOD = 7,
-	OPC_CONV = 8,
-	OPC_CONVM = 9,
-	OPC_GETSIZE = 10,
-	OPC_GETBUF = 11,
-	OPC_GETPOS = 12,
-	OPC_GETINFO = 13,
-	OPC_DSX = 14,
-	OPC_DSY = 15,
-	OPC_GATHER4R = 16,
-	OPC_GATHER4G = 17,
-	OPC_GATHER4B = 18,
-	OPC_GATHER4A = 19,
-	OPC_SAMGP0 = 20,
-	OPC_SAMGP1 = 21,
-	OPC_SAMGP2 = 22,
-	OPC_SAMGP3 = 23,
-	OPC_DSXPP_1 = 24,
-	OPC_DSYPP_1 = 25,
-	OPC_RGETPOS = 26,
-	OPC_RGETINFO = 27,
+	OPC_ISAM            = _OPC(5, 0),
+	OPC_ISAML           = _OPC(5, 1),
+	OPC_ISAMM           = _OPC(5, 2),
+	OPC_SAM             = _OPC(5, 3),
+	OPC_SAMB            = _OPC(5, 4),
+	OPC_SAML            = _OPC(5, 5),
+	OPC_SAMGQ           = _OPC(5, 6),
+	OPC_GETLOD          = _OPC(5, 7),
+	OPC_CONV            = _OPC(5, 8),
+	OPC_CONVM           = _OPC(5, 9),
+	OPC_GETSIZE         = _OPC(5, 10),
+	OPC_GETBUF          = _OPC(5, 11),
+	OPC_GETPOS          = _OPC(5, 12),
+	OPC_GETINFO         = _OPC(5, 13),
+	OPC_DSX             = _OPC(5, 14),
+	OPC_DSY             = _OPC(5, 15),
+	OPC_GATHER4R        = _OPC(5, 16),
+	OPC_GATHER4G        = _OPC(5, 17),
+	OPC_GATHER4B        = _OPC(5, 18),
+	OPC_GATHER4A        = _OPC(5, 19),
+	OPC_SAMGP0          = _OPC(5, 20),
+	OPC_SAMGP1          = _OPC(5, 21),
+	OPC_SAMGP2          = _OPC(5, 22),
+	OPC_SAMGP3          = _OPC(5, 23),
+	OPC_DSXPP_1         = _OPC(5, 24),
+	OPC_DSYPP_1         = _OPC(5, 25),
+	OPC_RGETPOS         = _OPC(5, 26),
+	OPC_RGETINFO        = _OPC(5, 27),
 
 	/* category 6: */
-	OPC_LDG = 0,        /* load-global */
-	OPC_LDL = 1,
-	OPC_LDP = 2,
-	OPC_STG = 3,        /* store-global */
-	OPC_STL = 4,
-	OPC_STP = 5,
-	OPC_STI = 6,
-	OPC_G2L = 7,
-	OPC_L2G = 8,
-	OPC_PREFETCH = 9,
-	OPC_LDLW = 10,
-	OPC_STLW = 11,
-	OPC_RESFMT = 14,
-	OPC_RESINFO = 15,
-	OPC_ATOMIC_ADD = 16,
-	OPC_ATOMIC_SUB = 17,
-	OPC_ATOMIC_XCHG = 18,
-	OPC_ATOMIC_INC = 19,
-	OPC_ATOMIC_DEC = 20,
-	OPC_ATOMIC_CMPXCHG = 21,
-	OPC_ATOMIC_MIN = 22,
-	OPC_ATOMIC_MAX = 23,
-	OPC_ATOMIC_AND = 24,
-	OPC_ATOMIC_OR = 25,
-	OPC_ATOMIC_XOR = 26,
-	OPC_LDGB_TYPED_4D = 27,
-	OPC_STGB_4D_4 = 28,
-	OPC_STIB = 29,
-	OPC_LDC_4 = 30,
-	OPC_LDLV = 31,
+	OPC_LDG             = _OPC(6, 0),        /* load-global */
+	OPC_LDL             = _OPC(6, 1),
+	OPC_LDP             = _OPC(6, 2),
+	OPC_STG             = _OPC(6, 3),        /* store-global */
+	OPC_STL             = _OPC(6, 4),
+	OPC_STP             = _OPC(6, 5),
+	OPC_STI             = _OPC(6, 6),
+	OPC_G2L             = _OPC(6, 7),
+	OPC_L2G             = _OPC(6, 8),
+	OPC_PREFETCH        = _OPC(6, 9),
+	OPC_LDLW            = _OPC(6, 10),
+	OPC_STLW            = _OPC(6, 11),
+	OPC_RESFMT          = _OPC(6, 14),
+	OPC_RESINFO         = _OPC(6, 15),
+	OPC_ATOMIC_ADD      = _OPC(6, 16),
+	OPC_ATOMIC_SUB      = _OPC(6, 17),
+	OPC_ATOMIC_XCHG     = _OPC(6, 18),
+	OPC_ATOMIC_INC      = _OPC(6, 19),
+	OPC_ATOMIC_DEC      = _OPC(6, 20),
+	OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
+	OPC_ATOMIC_MIN      = _OPC(6, 22),
+	OPC_ATOMIC_MAX      = _OPC(6, 23),
+	OPC_ATOMIC_AND      = _OPC(6, 24),
+	OPC_ATOMIC_OR       = _OPC(6, 25),
+	OPC_ATOMIC_XOR      = _OPC(6, 26),
+	OPC_LDGB_TYPED_4D   = _OPC(6, 27),
+	OPC_STGB_4D_4       = _OPC(6, 28),
+	OPC_STIB            = _OPC(6, 29),
+	OPC_LDC_4           = _OPC(6, 30),
+	OPC_LDLV            = _OPC(6, 31),
 
 	/* meta instructions (category -1): */
 	/* placeholder instr to mark shader inputs: */
-	OPC_META_INPUT = 0,
-	OPC_META_PHI = 1,
+	OPC_META_INPUT      = _OPC(-1, 0),
+	OPC_META_PHI        = _OPC(-1, 1),
 	/* The "fan-in" and "fan-out" instructions are used for keeping
 	 * track of instructions that write to multiple dst registers
 	 * (fan-out) like texture sample instructions, or read multiple
 	 * consecutive scalar registers (fan-in) (bary.f, texture samp)
 	 */
-	OPC_META_FO = 2,
-	OPC_META_FI = 3,
+	OPC_META_FO         = _OPC(-1, 2),
+	OPC_META_FI         = _OPC(-1, 3),
 
 } opc_t;
 
+#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
+#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
+
 typedef enum {
 	TYPE_F16 = 0,
 	TYPE_F32 = 1,
@@ -472,7 +480,7 @@ typedef struct PACKED {
 
 static inline bool instr_cat3_full(instr_cat3_t *cat3)
 {
-	switch (cat3->opc) {
+	switch (_OPC(3, cat3->opc)) {
 	case OPC_MAD_F16:
 	case OPC_MAD_U16:
 	case OPC_MAD_S16:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 7d89142d7a1..d86dfca9d54 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -688,6 +688,7 @@ struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
 	struct ir3_instruction *instr = instr_create(block, nreg);
 	instr->block = block;
 	instr->category = category;
+	debug_assert(opc_cat(opc) == category);
 	instr->opc = opc;
 	insert_instr(block, instr);
 	return instr;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 1a109d880e6..c9a9d29faf7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -902,7 +902,7 @@ static inline struct ir3_instruction *
 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
 {
 	struct ir3_instruction *instr =
-		ir3_instr_create(block, 1, 0);
+		ir3_instr_create(block, 1, OPC_MOV);
 	ir3_reg_create(instr, 0, 0);   /* dst */
 	if (src->regs[0]->flags & IR3_REG_ARRAY) {
 		struct ir3_register *src_reg =
@@ -923,7 +923,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
 		type_t src_type, type_t dst_type)
 {
 	struct ir3_instruction *instr =
-		ir3_instr_create(block, 1, 0);
+		ir3_instr_create(block, 1, OPC_MOV);
 	ir3_reg_create(instr, 0, 0);   /* dst */
 	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 	instr->cat1.src_type = src_type;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 3d656d4a34d..fafd6beea4b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -286,7 +286,7 @@ create_immed(struct ir3_block *block, uint32_t val)
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(block, 1, 0);
+	mov = ir3_instr_create(block, 1, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -366,7 +366,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(ctx->block, 1, 0);
+	mov = ir3_instr_create(ctx->block, 1, OPC_MOV);
 	/* TODO get types right? */
 	mov->cat1.src_type = TYPE_F32;
 	mov->cat1.dst_type = TYPE_F32;
@@ -382,7 +382,7 @@ create_uniform_indirect(struct ir3_compile *ctx, int n,
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(ctx->block, 1, 0);
+	mov = ir3_instr_create(ctx->block, 1, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -418,7 +418,7 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
 	struct ir3_instruction *mov;
 	struct ir3_register *src;
 
-	mov = ir3_instr_create(block, 1, 0);
+	mov = ir3_instr_create(block, 1, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -441,7 +441,7 @@ create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 	struct ir3_instruction *mov;
 	struct ir3_register *src;
 
-	mov = ir3_instr_create(block, 1, 0);
+	mov = ir3_instr_create(block, 1, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -469,7 +469,7 @@ create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 	struct ir3_instruction *mov;
 	struct ir3_register *dst;
 
-	mov = ir3_instr_create(block, 1, 0);
+	mov = ir3_instr_create(block, 1, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |

From 19739e4fb9024f42a8fc332e6fa94c292bb6bc16 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 27 Mar 2016 13:43:45 -0400
Subject: [PATCH 08/72] freedreno/ir3: remove ir3_instruction::category

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.c       |  3 +-
 src/gallium/drivers/freedreno/ir3/ir3.h       | 93 +++++++++----------
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  |  6 +-
 src/gallium/drivers/freedreno/ir3/ir3_cp.c    | 32 +++----
 src/gallium/drivers/freedreno/ir3/ir3_depth.c |  3 +-
 src/gallium/drivers/freedreno/ir3/ir3_group.c |  7 +-
 .../drivers/freedreno/ir3/ir3_legalize.c      |  4 +-
 src/gallium/drivers/freedreno/ir3/ir3_print.c | 10 +-
 src/gallium/drivers/freedreno/ir3/ir3_ra.c    | 14 +--
 src/gallium/drivers/freedreno/ir3/ir3_sched.c |  5 +-
 10 files changed, 84 insertions(+), 93 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index d86dfca9d54..2e7f2008067 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -612,7 +612,7 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
 
 	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
 		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			int ret = emit[instr->category](instr, dwords, info);
+			int ret = emit[opc_cat(instr->opc)](instr, dwords, info);
 			if (ret)
 				goto fail;
 			info->instrs_count += 1 + instr->repeat;
@@ -687,7 +687,6 @@ struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
 {
 	struct ir3_instruction *instr = instr_create(block, nreg);
 	instr->block = block;
-	instr->category = category;
 	debug_assert(opc_cat(opc) == category);
 	instr->opc = opc;
 	insert_instr(block, instr);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index c9a9d29faf7..1391cbd97da 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -130,7 +130,6 @@ struct ir3_register {
 
 struct ir3_instruction {
 	struct ir3_block *block;
-	int category;
 	opc_t opc;
 	enum {
 		/* (sy) flag is set on first instruction, and after sample
@@ -508,17 +507,17 @@ static inline uint32_t reg_comp(struct ir3_register *reg)
 
 static inline bool is_flow(struct ir3_instruction *instr)
 {
-	return (instr->category == 0);
+	return (opc_cat(instr->opc) == 0);
 }
 
 static inline bool is_kill(struct ir3_instruction *instr)
 {
-	return is_flow(instr) && (instr->opc == OPC_KILL);
+	return instr->opc == OPC_KILL;
 }
 
 static inline bool is_nop(struct ir3_instruction *instr)
 {
-	return is_flow(instr) && (instr->opc == OPC_NOP);
+	return instr->opc == OPC_NOP;
 }
 
 /* Is it a non-transformative (ie. not type changing) mov?  This can
@@ -538,75 +537,71 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr)
 	if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
 		return false;
 
-	if ((instr->category == 1) &&
-			(instr->cat1.src_type == instr->cat1.dst_type))
+	switch (instr->opc) {
+	case OPC_MOV:
+		return instr->cat1.src_type == instr->cat1.dst_type;
+	case OPC_ABSNEG_F:
+	case OPC_ABSNEG_S:
 		return true;
-	if ((instr->category == 2) && ((instr->opc == OPC_ABSNEG_F) ||
-			(instr->opc == OPC_ABSNEG_S)))
-		return true;
-	return false;
+	default:
+		return false;
+	}
 }
 
 static inline bool is_alu(struct ir3_instruction *instr)
 {
-	return (1 <= instr->category) && (instr->category <= 3);
+	return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
 }
 
 static inline bool is_sfu(struct ir3_instruction *instr)
 {
-	return (instr->category == 4);
+	return (opc_cat(instr->opc) == 4);
 }
 
 static inline bool is_tex(struct ir3_instruction *instr)
 {
-	return (instr->category == 5);
+	return (opc_cat(instr->opc) == 5);
 }
 
 static inline bool is_mem(struct ir3_instruction *instr)
 {
-	return (instr->category == 6);
+	return (opc_cat(instr->opc) == 6);
 }
 
 static inline bool
 is_store(struct ir3_instruction *instr)
 {
-	if (is_mem(instr)) {
-		/* these instructions, the "destination" register is
-		 * actually a source, the address to store to.
-		 */
-		switch (instr->opc) {
-		case OPC_STG:
-		case OPC_STP:
-		case OPC_STL:
-		case OPC_STLW:
-		case OPC_L2G:
-		case OPC_G2L:
-			return true;
-		default:
-			break;
-		}
+	/* these instructions, the "destination" register is
+	 * actually a source, the address to store to.
+	 */
+	switch (instr->opc) {
+	case OPC_STG:
+	case OPC_STP:
+	case OPC_STL:
+	case OPC_STLW:
+	case OPC_L2G:
+	case OPC_G2L:
+		return true;
+	default:
+		return false;
 	}
-	return false;
 }
 
 static inline bool is_load(struct ir3_instruction *instr)
 {
-	if (is_mem(instr)) {
-		switch (instr->opc) {
-		case OPC_LDG:
-		case OPC_LDL:
-		case OPC_LDP:
-		case OPC_L2G:
-		case OPC_LDLW:
-		case OPC_LDC_4:
-		case OPC_LDLV:
+	switch (instr->opc) {
+	case OPC_LDG:
+	case OPC_LDL:
+	case OPC_LDP:
+	case OPC_L2G:
+	case OPC_LDLW:
+	case OPC_LDC_4:
+	case OPC_LDLV:
 		/* probably some others too.. */
-			return true;
-		default:
-			break;
-		}
+		return true;
+	default:
+		return false;
 	}
-	return false;
 }
 
 static inline bool is_input(struct ir3_instruction *instr)
@@ -615,9 +610,13 @@ static inline bool is_input(struct ir3_instruction *instr)
 	 * interpolation.. fortunately inloc is the first src
 	 * register in either case
 	 */
-	if (is_mem(instr) && (instr->opc == OPC_LDLV))
+	switch (instr->opc) {
+	case OPC_LDLV:
+	case OPC_BARY_F:
 		return true;
-	return (instr->category == 2) && (instr->opc == OPC_BARY_F);
+	default:
+		return false;
+	}
 }
 
 static inline bool is_meta(struct ir3_instruction *instr)
@@ -626,7 +625,7 @@ static inline bool is_meta(struct ir3_instruction *instr)
 	 * might actually contribute some instructions to the final
 	 * result?
 	 */
-	return (instr->category == -1);
+	return (opc_cat(instr->opc) == -1);
 }
 
 static inline bool writes_addr(struct ir3_instruction *instr)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index fafd6beea4b..1f043e5d9d6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1651,7 +1651,7 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
 		nir_phi_instr *nphi;
 
 		/* phi's only come at start of block: */
-		if (!(is_meta(instr) && (instr->opc == OPC_META_PHI)))
+		if (instr->opc != OPC_META_PHI)
 			break;
 
 		if (!instr->phi.nphi)
@@ -2323,12 +2323,12 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 			 * in which case we need to propagate the half-reg flag
 			 * up to the definer so that RA sees it:
 			 */
-			if (is_meta(out) && (out->opc == OPC_META_FO)) {
+			if (out->opc == OPC_META_FO) {
 				out = out->regs[1]->instr;
 				out->regs[0]->flags |= IR3_REG_HALF;
 			}
 
-			if (out->category == 1) {
+			if (out->opc == OPC_MOV) {
 				out->cat1.dst_type = half_type(out->cat1.dst_type);
 			}
 		}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 1cc211a7663..48870074514 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -58,14 +58,14 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
 				return false;
 
 		/* TODO: remove this hack: */
-		if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
+		if (src_instr->opc == OPC_META_FO)
 			return false;
 		/* TODO: we currently don't handle left/right neighbors
 		 * very well when inserting parallel-copies into phi..
 		 * to avoid problems don't eliminate a mov coming out
 		 * of phi..
 		 */
-		if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI))
+		if (src_instr->opc == OPC_META_PHI)
 			return false;
 		return true;
 	}
@@ -96,7 +96,7 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
 		return false;
 
 	/* clear flags that are 'ok' */
-	switch (instr->category) {
+	switch (opc_cat(instr->opc)) {
 	case 1:
 		valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
 		if (flags & ~valid_flags)
@@ -205,15 +205,6 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags)
 	*dstflags |= srcflags & IR3_REG_ARRAY;
 }
 
-/* the "plain" MAD's (ie. the ones that don't shift first src prior to
- * multiply) can swap their first two srcs if src[0] is !CONST and
- * src[1] is CONST:
- */
-static bool is_valid_mad(struct ir3_instruction *instr)
-{
-	return (instr->category == 3) && is_mad(instr->opc);
-}
-
 /**
  * Handle cp for a given src register.  This additionally handles
  * the cases of collapsing immedate/const (which replace the src
@@ -257,8 +248,12 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 		if (!valid_flags(instr, n, new_flags)) {
 			/* special case for "normal" mad instructions, we can
 			 * try swapping the first two args if that fits better.
+			 *
+			 * the "plain" MAD's (ie. the ones that don't shift first
+			 * src prior to multiply) can swap their first two srcs if
+			 * src[0] is !CONST and src[1] is CONST:
 			 */
-			if ((n == 1) && is_valid_mad(instr) &&
+			if ((n == 1) && is_mad(instr->opc) &&
 					!(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
 					valid_flags(instr, 0, new_flags)) {
 				/* swap src[0] and src[1]: */
@@ -292,7 +287,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 			 * just somehow don't work out.  This restriction may only
 			 * apply if the first src is also CONST.
 			 */
-			if ((instr->category == 3) && (n == 2) &&
+			if ((opc_cat(instr->opc) == 3) && (n == 2) &&
 					(src_reg->flags & IR3_REG_RELATIV) &&
 					(src_reg->array.offset == 0))
 				return;
@@ -328,10 +323,9 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 		if (src_reg->flags & IR3_REG_IMMED) {
 			int32_t iim_val = src_reg->iim_val;
 
-			debug_assert((instr->category == 1) ||
-					(instr->category == 6) ||
-					((instr->category == 2) &&
-						ir3_cat2_int(instr->opc)));
+			debug_assert((opc_cat(instr->opc) == 1) ||
+					(opc_cat(instr->opc) == 6) ||
+					ir3_cat2_int(instr->opc));
 
 			if (new_flags & IR3_REG_SABS)
 				iim_val = abs(iim_val);
@@ -343,7 +337,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 				iim_val = ~iim_val;
 
 			/* other than category 1 (mov) we can only encode up to 10 bits: */
-			if ((instr->category == 1) || !(iim_val & ~0x3ff)) {
+			if ((instr->opc == OPC_MOV) || !(iim_val & ~0x3ff)) {
 				new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
 				src_reg = ir3_reg_clone(instr->block->shader, src_reg);
 				src_reg->flags = new_flags;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 6d294f1a48c..c3f6de965ce 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -74,8 +74,7 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
 			is_mem(consumer)) {
 		return 6;
-	} else if ((consumer->category == 3) &&
-			(is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
+	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
 			(n == 3)) {
 		/* special case, 3rd src to cat3 not required on first cycle */
 		return 1;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index ca28aefd502..70212968d89 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -69,8 +69,7 @@ static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
 
 	/* create src reg for meta:in and fixup to now be a mov: */
 	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = in;
-	instr->category = 1;
-	instr->opc = 0;
+	instr->opc = OPC_MOV;
 	instr->cat1.src_type = TYPE_F32;
 	instr->cat1.dst_type = TYPE_F32;
 
@@ -117,7 +116,7 @@ restart:
 				conflicts(instr->cp.right, right);
 
 			/* RA can't yet deal very well w/ group'd phi's: */
-			if (is_meta(instr) && (instr->opc == OPC_META_PHI))
+			if (instr->opc == OPC_META_PHI)
 				conflict = true;
 
 			/* we also can't have an instr twice in the group: */
@@ -168,7 +167,7 @@ instr_find_neighbors(struct ir3_instruction *instr)
 	if (ir3_instr_check_mark(instr))
 		return;
 
-	if (is_meta(instr) && (instr->opc == OPC_META_FI))
+	if (instr->opc == OPC_META_FI)
 		group_n(&instr_ops, instr, instr->regs_count - 1);
 
 	foreach_ssa_src(src, instr)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index e94293f6d6b..613588d276f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -146,7 +146,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 		 * clever if we were aware of this during scheduling, but
 		 * this should be a pretty rare case:
 		 */
-		if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) {
+		if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
 			struct ir3_instruction *nop;
 			nop = ir3_NOP(block);
 			nop->flags |= IR3_INSTR_SS;
@@ -154,7 +154,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 		}
 
 		/* need to be able to set (ss) on first instruction: */
-		if (list_empty(&block->instr_list) && (n->category >= 5))
+		if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
 			ir3_NOP(block);
 
 		if (is_nop(n) && !list_empty(&block->instr_list)) {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index ba0c4a57aa3..86418b89820 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -35,6 +35,8 @@
 
 static void print_instr_name(struct ir3_instruction *instr)
 {
+	if (!instr)
+		return;
 #ifdef DEBUG
 	printf("%04u:", instr->serialno);
 #endif
@@ -61,7 +63,7 @@ static void print_instr_name(struct ir3_instruction *instr)
 			}
 			break;
 		}
-	} else if (instr->category == 1) {
+	} else if (instr->opc == OPC_MOV) {
 		static const char *type[] = {
 				[TYPE_F16] = "f16",
 				[TYPE_F32] = "f32",
@@ -191,10 +193,8 @@ print_instr(struct ir3_instruction *instr, int lvl)
 		printf("]");
 	}
 
-	if (is_meta(instr)) {
-		if (instr->opc == OPC_META_FO) {
-			printf(", off=%d", instr->fo.off);
-		}
+	if (instr->opc == OPC_META_FO) {
+		printf(", off=%d", instr->fo.off);
 	}
 
 	if (is_flow(instr) && instr->cat0.target) {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index bcad96e8a30..2f369e677e0 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -342,7 +342,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 		return id->defn;
 	}
 
-	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
+	if (instr->opc == OPC_META_FI) {
 		/* What about the case where collect is subset of array, we
 		 * need to find the distance between where actual array starts
 		 * and fanin..  that probably doesn't happen currently.
@@ -436,7 +436,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 		}
 	}
 
-	if (is_meta(d) && (d->opc == OPC_META_PHI)) {
+	if (d->opc == OPC_META_PHI) {
 		/* we have already inserted parallel-copies into
 		 * the phi, so we don't need to chase definers
 		 */
@@ -456,7 +456,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 		d = dd;
 	}
 
-	if (is_meta(d) && (d->opc == OPC_META_FO)) {
+	if (d->opc == OPC_META_FO) {
 		struct ir3_instruction *dd;
 		int dsz, doff;
 
@@ -869,7 +869,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 /* some instructions need fix-up if dst register is half precision: */
 static void fixup_half_instr_dst(struct ir3_instruction *instr)
 {
-	switch (instr->category) {
+	switch (opc_cat(instr->opc)) {
 	case 1: /* move instructions */
 		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
 		break;
@@ -910,10 +910,12 @@ static void fixup_half_instr_dst(struct ir3_instruction *instr)
 /* some instructions need fix-up if src register is half precision: */
 static void fixup_half_instr_src(struct ir3_instruction *instr)
 {
-	switch (instr->category) {
-	case 1: /* move instructions */
+	switch (instr->opc) {
+	case OPC_MOV:
 		instr->cat1.src_type = half_type(instr->cat1.src_type);
 		break;
+	default:
+		break;
 	}
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 8f640febc5d..9be5ca34ccd 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -511,8 +511,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	 * occupied), and move remaining to depth sorted list:
 	 */
 	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
-				(instr->opc == OPC_META_PHI))) {
+		if ((instr->opc == OPC_META_INPUT) || (instr->opc == OPC_META_PHI)) {
 			schedule(ctx, instr);
 		} else {
 			ir3_insert_by_depth(instr, &ctx->depth_list);
@@ -627,7 +626,7 @@ static void
 sched_insert_parallel_copies(struct ir3_block *block)
 {
 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
+		if (instr->opc == OPC_META_PHI) {
 			struct ir3_register *reg;
 			foreach_src(reg, instr) {
 				struct ir3_instruction *src = reg->instr;

From 38ae05a340bdf526d5da62159223ad9938fea36a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 4 Apr 2016 17:54:39 -0400
Subject: [PATCH 09/72] freedreno/ir3: drop unused instr category arg

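The instruction category is now derived from the opcode via opc_cat(),
so the explicit category argument is no longer used.  Drop the extra
arg from ir3_instr_create() and ir3_instr_create2(), and the matching
CAT parameter from the INSTRn() helper macros.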

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.c       |   8 +-
 src/gallium/drivers/freedreno/ir3/ir3.h       | 187 +++++++++---------
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  |  23 ++-
 src/gallium/drivers/freedreno/ir3/ir3_group.c |   2 +-
 .../drivers/freedreno/ir3/ir3_legalize.c      |   2 +-
 5 files changed, 108 insertions(+), 114 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 2e7f2008067..3de8fdc11b3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -683,23 +683,21 @@ static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg)
 }
 
 struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
-		int category, opc_t opc, int nreg)
+		opc_t opc, int nreg)
 {
 	struct ir3_instruction *instr = instr_create(block, nreg);
 	instr->block = block;
-	debug_assert(opc_cat(opc) == category);
 	instr->opc = opc;
 	insert_instr(block, instr);
 	return instr;
 }
 
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
-		int category, opc_t opc)
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc)
 {
 	/* NOTE: we could be slightly more clever, at least for non-meta,
 	 * and choose # of regs based on category.
 	 */
-	return ir3_instr_create2(block, category, opc, 4);
+	return ir3_instr_create2(block, opc, 4);
 }
 
 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 1391cbd97da..f268c2b38e9 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -443,10 +443,9 @@ void * ir3_alloc(struct ir3 *shader, int sz);
 
 struct ir3_block * ir3_block_create(struct ir3 *shader);
 
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
-		int category, opc_t opc);
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
 struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
-		int category, opc_t opc, int nreg);
+		opc_t opc, int nreg);
 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
 const char *ir3_instr_name(struct ir3_instruction *instr);
 
@@ -900,8 +899,7 @@ void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary);
 static inline struct ir3_instruction *
 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
 {
-	struct ir3_instruction *instr =
-		ir3_instr_create(block, 1, OPC_MOV);
+	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
 	ir3_reg_create(instr, 0, 0);   /* dst */
 	if (src->regs[0]->flags & IR3_REG_ARRAY) {
 		struct ir3_register *src_reg =
@@ -921,8 +919,7 @@ static inline struct ir3_instruction *
 ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
 		type_t src_type, type_t dst_type)
 {
-	struct ir3_instruction *instr =
-		ir3_instr_create(block, 1, OPC_MOV);
+	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
 	ir3_reg_create(instr, 0, 0);   /* dst */
 	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 	instr->cat1.src_type = src_type;
@@ -934,45 +931,45 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
 static inline struct ir3_instruction *
 ir3_NOP(struct ir3_block *block)
 {
-	return ir3_instr_create(block, 0, OPC_NOP);
+	return ir3_instr_create(block, OPC_NOP);
 }
 
-#define INSTR0(CAT, name)                                                \
+#define INSTR0(name)                                                     \
 static inline struct ir3_instruction *                                   \
 ir3_##name(struct ir3_block *block)                                      \
 {                                                                        \
 	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, CAT, OPC_##name);                        \
+		ir3_instr_create(block, OPC_##name);                             \
 	return instr;                                                        \
 }
 
-#define INSTR1(CAT, name)                                                \
+#define INSTR1(name)                                                     \
 static inline struct ir3_instruction *                                   \
 ir3_##name(struct ir3_block *block,                                      \
 		struct ir3_instruction *a, unsigned aflags)                      \
 {                                                                        \
 	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, CAT, OPC_##name);                        \
+		ir3_instr_create(block, OPC_##name);                             \
 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
 	return instr;                                                        \
 }
 
-#define INSTR2(CAT, name)                                                \
+#define INSTR2(name)                                                     \
 static inline struct ir3_instruction *                                   \
 ir3_##name(struct ir3_block *block,                                      \
 		struct ir3_instruction *a, unsigned aflags,                      \
 		struct ir3_instruction *b, unsigned bflags)                      \
 {                                                                        \
 	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, CAT, OPC_##name);                        \
+		ir3_instr_create(block, OPC_##name);                             \
 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
 	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
 	return instr;                                                        \
 }
 
-#define INSTR3(CAT, name)                                                \
+#define INSTR3(name)                                                     \
 static inline struct ir3_instruction *                                   \
 ir3_##name(struct ir3_block *block,                                      \
 		struct ir3_instruction *a, unsigned aflags,                      \
@@ -980,7 +977,7 @@ ir3_##name(struct ir3_block *block,                                      \
 		struct ir3_instruction *c, unsigned cflags)                      \
 {                                                                        \
 	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, CAT, OPC_##name);                        \
+		ir3_instr_create(block, OPC_##name);                             \
 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
 	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
@@ -989,89 +986,89 @@ ir3_##name(struct ir3_block *block,                                      \
 }
 
 /* cat0 instructions: */
-INSTR0(0, BR);
-INSTR0(0, JUMP);
-INSTR1(0, KILL);
-INSTR0(0, END);
+INSTR0(BR);
+INSTR0(JUMP);
+INSTR1(KILL);
+INSTR0(END);
 
 /* cat2 instructions, most 2 src but some 1 src: */
-INSTR2(2, ADD_F)
-INSTR2(2, MIN_F)
-INSTR2(2, MAX_F)
-INSTR2(2, MUL_F)
-INSTR1(2, SIGN_F)
-INSTR2(2, CMPS_F)
-INSTR1(2, ABSNEG_F)
-INSTR2(2, CMPV_F)
-INSTR1(2, FLOOR_F)
-INSTR1(2, CEIL_F)
-INSTR1(2, RNDNE_F)
-INSTR1(2, RNDAZ_F)
-INSTR1(2, TRUNC_F)
-INSTR2(2, ADD_U)
-INSTR2(2, ADD_S)
-INSTR2(2, SUB_U)
-INSTR2(2, SUB_S)
-INSTR2(2, CMPS_U)
-INSTR2(2, CMPS_S)
-INSTR2(2, MIN_U)
-INSTR2(2, MIN_S)
-INSTR2(2, MAX_U)
-INSTR2(2, MAX_S)
-INSTR1(2, ABSNEG_S)
-INSTR2(2, AND_B)
-INSTR2(2, OR_B)
-INSTR1(2, NOT_B)
-INSTR2(2, XOR_B)
-INSTR2(2, CMPV_U)
-INSTR2(2, CMPV_S)
-INSTR2(2, MUL_U)
-INSTR2(2, MUL_S)
-INSTR2(2, MULL_U)
-INSTR1(2, BFREV_B)
-INSTR1(2, CLZ_S)
-INSTR1(2, CLZ_B)
-INSTR2(2, SHL_B)
-INSTR2(2, SHR_B)
-INSTR2(2, ASHR_B)
-INSTR2(2, BARY_F)
-INSTR2(2, MGEN_B)
-INSTR2(2, GETBIT_B)
-INSTR1(2, SETRM)
-INSTR1(2, CBITS_B)
-INSTR2(2, SHB)
-INSTR2(2, MSAD)
+INSTR2(ADD_F)
+INSTR2(MIN_F)
+INSTR2(MAX_F)
+INSTR2(MUL_F)
+INSTR1(SIGN_F)
+INSTR2(CMPS_F)
+INSTR1(ABSNEG_F)
+INSTR2(CMPV_F)
+INSTR1(FLOOR_F)
+INSTR1(CEIL_F)
+INSTR1(RNDNE_F)
+INSTR1(RNDAZ_F)
+INSTR1(TRUNC_F)
+INSTR2(ADD_U)
+INSTR2(ADD_S)
+INSTR2(SUB_U)
+INSTR2(SUB_S)
+INSTR2(CMPS_U)
+INSTR2(CMPS_S)
+INSTR2(MIN_U)
+INSTR2(MIN_S)
+INSTR2(MAX_U)
+INSTR2(MAX_S)
+INSTR1(ABSNEG_S)
+INSTR2(AND_B)
+INSTR2(OR_B)
+INSTR1(NOT_B)
+INSTR2(XOR_B)
+INSTR2(CMPV_U)
+INSTR2(CMPV_S)
+INSTR2(MUL_U)
+INSTR2(MUL_S)
+INSTR2(MULL_U)
+INSTR1(BFREV_B)
+INSTR1(CLZ_S)
+INSTR1(CLZ_B)
+INSTR2(SHL_B)
+INSTR2(SHR_B)
+INSTR2(ASHR_B)
+INSTR2(BARY_F)
+INSTR2(MGEN_B)
+INSTR2(GETBIT_B)
+INSTR1(SETRM)
+INSTR1(CBITS_B)
+INSTR2(SHB)
+INSTR2(MSAD)
 
 /* cat3 instructions: */
-INSTR3(3, MAD_U16)
-INSTR3(3, MADSH_U16)
-INSTR3(3, MAD_S16)
-INSTR3(3, MADSH_M16)
-INSTR3(3, MAD_U24)
-INSTR3(3, MAD_S24)
-INSTR3(3, MAD_F16)
-INSTR3(3, MAD_F32)
-INSTR3(3, SEL_B16)
-INSTR3(3, SEL_B32)
-INSTR3(3, SEL_S16)
-INSTR3(3, SEL_S32)
-INSTR3(3, SEL_F16)
-INSTR3(3, SEL_F32)
-INSTR3(3, SAD_S16)
-INSTR3(3, SAD_S32)
+INSTR3(MAD_U16)
+INSTR3(MADSH_U16)
+INSTR3(MAD_S16)
+INSTR3(MADSH_M16)
+INSTR3(MAD_U24)
+INSTR3(MAD_S24)
+INSTR3(MAD_F16)
+INSTR3(MAD_F32)
+INSTR3(SEL_B16)
+INSTR3(SEL_B32)
+INSTR3(SEL_S16)
+INSTR3(SEL_S32)
+INSTR3(SEL_F16)
+INSTR3(SEL_F32)
+INSTR3(SAD_S16)
+INSTR3(SAD_S32)
 
 /* cat4 instructions: */
-INSTR1(4, RCP)
-INSTR1(4, RSQ)
-INSTR1(4, LOG2)
-INSTR1(4, EXP2)
-INSTR1(4, SIN)
-INSTR1(4, COS)
-INSTR1(4, SQRT)
+INSTR1(RCP)
+INSTR1(RSQ)
+INSTR1(LOG2)
+INSTR1(EXP2)
+INSTR1(SIN)
+INSTR1(COS)
+INSTR1(SQRT)
 
 /* cat5 instructions: */
-INSTR1(5, DSX)
-INSTR1(5, DSY)
+INSTR1(DSX)
+INSTR1(DSY)
 
 static inline struct ir3_instruction *
 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
@@ -1081,7 +1078,7 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
 	struct ir3_instruction *sam;
 	struct ir3_register *reg;
 
-	sam = ir3_instr_create(block, 5, opc);
+	sam = ir3_instr_create(block, opc);
 	sam->flags |= flags;
 	ir3_reg_create(sam, 0, 0)->wrmask = wrmask;
 	if (src0) {
@@ -1102,9 +1099,9 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
 }
 
 /* cat6 instructions: */
-INSTR2(6, LDLV)
-INSTR2(6, LDG)
-INSTR3(6, STG)
+INSTR2(LDLV)
+INSTR2(LDG)
+INSTR3(STG)
 
 /* ************************************************************************* */
 /* split this out or find some helper to use.. like main/bitset.h.. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 1f043e5d9d6..3f14412998c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -286,7 +286,7 @@ create_immed(struct ir3_block *block, uint32_t val)
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(block, 1, OPC_MOV);
+	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -366,7 +366,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(ctx->block, 1, OPC_MOV);
+	mov = ir3_instr_create(ctx->block, OPC_MOV);
 	/* TODO get types right? */
 	mov->cat1.src_type = TYPE_F32;
 	mov->cat1.dst_type = TYPE_F32;
@@ -382,7 +382,7 @@ create_uniform_indirect(struct ir3_compile *ctx, int n,
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(ctx->block, 1, OPC_MOV);
+	mov = ir3_instr_create(ctx->block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -402,7 +402,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 	if (arrsz == 0)
 		return NULL;
 
-	collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
+	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
 	ir3_reg_create(collect, 0, 0);     /* dst */
 	for (unsigned i = 0; i < arrsz; i++)
 		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
@@ -418,7 +418,7 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
 	struct ir3_instruction *mov;
 	struct ir3_register *src;
 
-	mov = ir3_instr_create(block, 1, OPC_MOV);
+	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -441,7 +441,7 @@ create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 	struct ir3_instruction *mov;
 	struct ir3_register *src;
 
-	mov = ir3_instr_create(block, 1, OPC_MOV);
+	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -469,7 +469,7 @@ create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 	struct ir3_instruction *mov;
 	struct ir3_register *dst;
 
-	mov = ir3_instr_create(block, 1, OPC_MOV);
+	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
@@ -492,7 +492,7 @@ create_input(struct ir3_block *block, unsigned n)
 {
 	struct ir3_instruction *in;
 
-	in = ir3_instr_create(block, -1, OPC_META_INPUT);
+	in = ir3_instr_create(block, OPC_META_INPUT);
 	in->inout.block = block;
 	ir3_reg_create(in, n, 0);
 
@@ -617,8 +617,7 @@ split_dest(struct ir3_block *block, struct ir3_instruction **dst,
 {
 	struct ir3_instruction *prev = NULL;
 	for (int i = 0, j = 0; i < n; i++) {
-		struct ir3_instruction *split =
-				ir3_instr_create(block, -1, OPC_META_FO);
+		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
 		ir3_reg_create(split, 0, IR3_REG_SSA);
 		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
 		split->fo.off = i;
@@ -1631,7 +1630,7 @@ emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
 
 	dst = get_dst(ctx, &nphi->dest, 1);
 
-	phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI,
+	phi = ir3_instr_create2(ctx->block, OPC_META_PHI,
 			1 + exec_list_length(&nphi->srcs));
 	ir3_reg_create(phi, 0, 0);         /* dst */
 	phi->phi.nphi = nphi;
@@ -2144,7 +2143,7 @@ emit_instructions(struct ir3_compile *ctx)
 	if (ctx->so->type == SHADER_FRAGMENT) {
 		// TODO maybe a helper for fi since we need it a few places..
 		struct ir3_instruction *instr;
-		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+		instr = ir3_instr_create(ctx->block, OPC_META_FI);
 		ir3_reg_create(instr, 0, 0);
 		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
 		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index 70212968d89..cd59080b0f1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -63,7 +63,7 @@ static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
 
 	debug_assert(instr->regs_count == 1);
 
-	in = ir3_instr_create(instr->block, -1, OPC_META_INPUT);
+	in = ir3_instr_create(instr->block, OPC_META_INPUT);
 	in->inout.block = instr->block;
 	ir3_reg_create(in, instr->regs[0]->num, 0);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 613588d276f..77cd0e622f0 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -209,7 +209,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 			struct ir3_instruction *baryf;
 
 			/* (ss)bary.f (ei)r63.x, 0, r0.x */
-			baryf = ir3_instr_create(block, 2, OPC_BARY_F);
+			baryf = ir3_instr_create(block, OPC_BARY_F);
 			baryf->flags |= IR3_INSTR_SS;
 			ir3_reg_create(baryf, regid(63, 0), 0);
 			ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;

From d47fb856af4da5f56f80e072365b9286f0731a54 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 4 Apr 2016 14:16:12 -0400
Subject: [PATCH 10/72] freedreno/ir3: add dumping for use/def/live-in/live-out

Turned out to be useful to debug an issue in RA.  Let's keep it.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.h       | 10 ++++++
 src/gallium/drivers/freedreno/ir3/ir3_print.c | 11 +------
 src/gallium/drivers/freedreno/ir3/ir3_ra.c    | 31 +++++++++++++++++++
 3 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index f268c2b38e9..23e43b1e13d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -434,6 +434,16 @@ struct ir3_block {
 #endif
 };
 
+static inline uint32_t
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+	return block->serialno;
+#else
+	return (uint32_t)(unsigned long)block;
+#endif
+}
+
 struct ir3 * ir3_create(struct ir3_compiler *compiler,
 		unsigned nin, unsigned nout);
 void ir3_destroy(struct ir3 *shader);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index 86418b89820..8aebf21a1be 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -40,6 +40,7 @@ static void print_instr_name(struct ir3_instruction *instr)
 #ifdef DEBUG
 	printf("%04u:", instr->serialno);
 #endif
+	printf("%04u:", instr->name);
 	printf("%03u: ", instr->depth);
 
 	if (instr->flags & IR3_INSTR_SY)
@@ -148,16 +149,6 @@ tab(int lvl)
 		printf("\t");
 }
 
-static uint32_t
-block_id(struct ir3_block *block)
-{
-#ifdef DEBUG
-	return block->serialno;
-#else
-	return (uint32_t)(unsigned long)block;
-#endif
-}
-
 static void
 print_instr(struct ir3_instruction *instr, int lvl)
 {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 2f369e677e0..f06b27e9dd2 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -31,6 +31,8 @@
 #include "util/ralloc.h"
 #include "util/bitset.h"
 
+#include "freedreno_util.h"
+
 #include "ir3.h"
 #include "ir3_compiler.h"
 
@@ -809,6 +811,22 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
 	return progress;
 }
 
+static void
+print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
+{
+	bool first = true;
+	debug_printf("  %s:", name);
+	for (unsigned i = 0; i < cnt; i++) {
+		if (BITSET_TEST(bs, i)) {
+			if (!first)
+				debug_printf(",");
+			debug_printf(" %04u", i);
+			first = false;
+		}
+	}
+	debug_printf("\n");
+}
+
 static void
 ra_add_interference(struct ir3_ra_ctx *ctx)
 {
@@ -831,6 +849,19 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	/* update per-block livein/liveout: */
 	while (ra_compute_livein_liveout(ctx)) {}
 
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		debug_printf("AFTER LIVEIN/OUT:\n");
+		ir3_print(ir);
+		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+			struct ir3_ra_block_data *bd = block->data;
+			debug_printf("block%u:\n", block_id(block));
+			print_bitset("def", bd->def, ctx->alloc_count);
+			print_bitset("use", bd->use, ctx->alloc_count);
+			print_bitset("l/i", bd->livein, ctx->alloc_count);
+			print_bitset("l/o", bd->liveout, ctx->alloc_count);
+		}
+	}
+
 	/* extend start/end ranges based on livein/liveout info from cfg: */
 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {

From 383b6e87f90e0ac84a200e1677a44b370976c93b Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 4 Apr 2016 17:33:19 -0400
Subject: [PATCH 11/72] freedreno/ir3: we can't store immediate values

Fixes some transform-feedback piglits, like:

bin/ext_transform_feedback-nonflat-integral

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_cp.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 48870074514..00fa3538cc0 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -111,6 +111,19 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
 		valid_flags = IR3_REG_IMMED;
 		if (flags & ~valid_flags)
 			return false;
+
+		if (flags & IR3_REG_IMMED) {
+			/* doesn't seem like we can have immediate src for store
+			 * instructions:
+			 *
+			 * TODO this restriction could also apply to load instructions,
+			 * but for load instructions this arg is the address (and not
+			 * really sure any good way to test a hard-coded immed addr src)
+			 */
+			if (is_store(instr) && (n == 1))
+				return false;
+		}
+
 		break;
 	case 2:
 		valid_flags = ir3_cat2_absneg(instr->opc) |

From 8e451c2d06d18ee54dc3098b3987af6e0bc59f5e Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 4 Apr 2016 17:34:57 -0400
Subject: [PATCH 12/72] freedreno/ir3: don't cp into phi's

The block defining a phi source might not have been executed.  If we
allow copy propagation, we could end up pointing to a src instruction in
the wrong block.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_cp.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 00fa3538cc0..f032f0bd53f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -230,6 +230,12 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 {
 	struct ir3_instruction *src = ssa(reg);
 
+	/* don't propagate copies into a PHI, since we don't know if the
+	 * src block executed:
+	 */
+	if (instr->opc == OPC_META_PHI)
+		return;
+
 	if (is_eligible_mov(src, true)) {
 		/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
 		struct ir3_register *src_reg = src->regs[1];

From f8feb97ba5be50cd9bfe63a4d3b0ba35010a0d32 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 4 Apr 2016 17:36:41 -0400
Subject: [PATCH 13/72] freedreno/ir3: fix silly brain-fart in RA

We want to consider all the vars, not 1/32nd of them, when extending
live-ranges.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index f06b27e9dd2..ed3030d722a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -863,11 +863,10 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	}
 
 	/* extend start/end ranges based on livein/liveout info from cfg: */
-	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		struct ir3_ra_block_data *bd = block->data;
 
-		for (unsigned i = 0; i < bitset_words; i++) {
+		for (unsigned i = 0; i < ctx->alloc_count; i++) {
 			if (BITSET_TEST(bd->livein, i)) {
 				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
 				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);

From 3e135728268cf36a176dcd915108ad7dc0f4e457 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 4 Apr 2016 17:38:01 -0400
Subject: [PATCH 14/72] freedreno/ir3: deal with duplicate phi sources

Otherwise we end up with funny things like:

  mov.f32f32 r0.x, r1.y
  mov.f32f32 r0.x, r1.y

(It doesn't happen as much after fixing the problem w/ CP into phi src,
but it can still happen since we aren't too clever about generating phi
sources in the first place.)

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 9be5ca34ccd..b56da304f92 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -627,13 +627,28 @@ sched_insert_parallel_copies(struct ir3_block *block)
 {
 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
 		if (instr->opc == OPC_META_PHI) {
-			struct ir3_register *reg;
+			struct ir3_register *reg, *reg2;
 			foreach_src(reg, instr) {
 				struct ir3_instruction *src = reg->instr;
-				struct ir3_instruction *mov =
-					ir3_MOV(src->block, src, TYPE_U32);
-				mov->regs[0]->flags |= IR3_REG_PHI_SRC;
-				mov->regs[0]->instr = instr;
+				struct ir3_instruction *mov = NULL;
+
+				/* after CP we could end up w/ duplicate phi srcs: */
+				foreach_src(reg2, instr) {
+					if (reg == reg2)
+						break;
+					/* reg2 is before reg1 so already an inserted mov: */
+					else if (reg2->instr->regs[1]->instr == src) {
+						mov = reg2->instr;
+						break;
+					}
+				}
+
+				if (!mov) {
+					mov = ir3_MOV(src->block, src, TYPE_U32);
+					mov->regs[0]->flags |= IR3_REG_PHI_SRC;
+					mov->regs[0]->instr = instr;
+				}
+
 				reg->instr = mov;
 			}
 		}

From 70299474f58ad7c0299ea5f997019880d1c4deef Mon Sep 17 00:00:00 2001
From: Dongwon Kim <dongwon.kim@intel.com>
Date: Mon, 4 Apr 2016 17:14:10 -0700
Subject: [PATCH 15/72] egl: add EGL_KHR_reusable_sync to egl_dri
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch enables an EGL extension, EGL_KHR_reusable_sync.
This new extension basically provides a way for multiple APIs or
threads to be executed synchronously via a "reusable sync"
primitive shared by those threads/API calls.

This was implemented based on the specification at

https://www.khronos.org/registry/egl/extensions/KHR/EGL_KHR_reusable_sync.txt

v2
- use thread functions defined in C11/threads.h instead of
  using direct pthread calls
- make the timeout set with reference to CLOCK_MONOTONIC
- cleaned up the way expiration time is calculated
- (bug fix) in dri2_client_wait_sync, case EGL_SYNC_CL_EVENT_KHR
  has been added.
- (bug fix) in dri2_destroy_sync, return from cond_broadcast
  call is now stored in 'err' instead of 'ret' to prevent 'ret'
  from being reset to 'EGL_FALSE' even in successful case
- corrected minor syntax problems

v3
- dri2_egl_unref_sync is now of type 'void'. As a result, no error
  check is needed for this function call any more.
- (bug fix) resolved issue with duplicated unlocking of display in
  eglClientWaitSync when type of sync is "EGL_KHR_REUSABLE_SYNC"

Signed-off-by: Dongwon Kim <dongwon.kim@intel.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/egl/drivers/dri2/egl_dri2.c | 192 +++++++++++++++++++++++++++++++-
 src/egl/drivers/dri2/egl_dri2.h |   2 +
 src/egl/main/eglapi.c           |  17 ++-
 src/egl/main/eglsync.c          |   3 +-
 4 files changed, 206 insertions(+), 8 deletions(-)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 8f50f0ce573..490b0409c98 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -38,6 +38,8 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
+#include <c11/threads.h>
+#include <time.h>
 #ifdef HAVE_LIBDRM
 #include <xf86drm.h>
 #include <drm_fourcc.h>
@@ -623,6 +625,8 @@ dri2_setup_screen(_EGLDisplay *disp)
          disp->Extensions.KHR_cl_event2 = EGL_TRUE;
    }
 
+   disp->Extensions.KHR_reusable_sync = EGL_TRUE;
+
    if (dri2_dpy->image) {
       if (dri2_dpy->image->base.version >= 10 &&
           dri2_dpy->image->getCapabilities != NULL) {
@@ -2394,7 +2398,12 @@ dri2_egl_unref_sync(struct dri2_egl_display *dri2_dpy,
                     struct dri2_egl_sync *dri2_sync)
 {
    if (p_atomic_dec_zero(&dri2_sync->refcount)) {
-      dri2_dpy->fence->destroy_fence(dri2_dpy->dri_screen, dri2_sync->fence);
+      if (dri2_sync->base.Type == EGL_SYNC_REUSABLE_KHR)
+         cnd_destroy(&dri2_sync->cond);
+
+      if (dri2_sync->fence)
+         dri2_dpy->fence->destroy_fence(dri2_dpy->dri_screen, dri2_sync->fence);
+
       free(dri2_sync);
    }
 }
@@ -2408,6 +2417,8 @@ dri2_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy);
    struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
    struct dri2_egl_sync *dri2_sync;
+   EGLint ret;
+   pthread_condattr_t attr;
 
    dri2_sync = calloc(1, sizeof(struct dri2_egl_sync));
    if (!dri2_sync) {
@@ -2450,6 +2461,37 @@ dri2_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
                                             dri2_sync->fence, 0, 0))
          dri2_sync->base.SyncStatus = EGL_SIGNALED_KHR;
       break;
+
+   case EGL_SYNC_REUSABLE_KHR:
+      /* intialize attr */
+      ret = pthread_condattr_init(&attr);
+
+      if (ret) {
+         _eglError(EGL_BAD_ACCESS, "eglCreateSyncKHR");
+         free(dri2_sync);
+         return NULL;
+      }
+
+      /* change clock attribute to CLOCK_MONOTONIC */
+      ret = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+
+      if (ret) {
+         _eglError(EGL_BAD_ACCESS, "eglCreateSyncKHR");
+         free(dri2_sync);
+         return NULL;
+      }
+
+      ret = pthread_cond_init(&dri2_sync->cond, &attr);
+
+      if (ret) {
+         _eglError(EGL_BAD_ACCESS, "eglCreateSyncKHR");
+         free(dri2_sync);
+         return NULL;
+      }
+
+      /* initial status of reusable sync must be "unsignaled" */
+      dri2_sync->base.SyncStatus = EGL_UNSIGNALED_KHR;
+      break;
    }
 
    p_atomic_set(&dri2_sync->refcount, 1);
@@ -2461,9 +2503,27 @@ dri2_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync)
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy);
    struct dri2_egl_sync *dri2_sync = dri2_egl_sync(sync);
+   EGLint ret = EGL_TRUE;
+   EGLint err;
 
+   /* if type of sync is EGL_SYNC_REUSABLE_KHR and it is not signaled yet,
+    * then unlock all threads possibly blocked by the reusable sync before
+    * destroying it.
+    */
+   if (dri2_sync->base.Type == EGL_SYNC_REUSABLE_KHR &&
+       dri2_sync->base.SyncStatus == EGL_UNSIGNALED_KHR) {
+      dri2_sync->base.SyncStatus = EGL_SIGNALED_KHR;
+      /* unblock all threads currently blocked by sync */
+      err = cnd_broadcast(&dri2_sync->cond);
+
+      if (err) {
+         _eglError(EGL_BAD_ACCESS, "eglDestroySyncKHR");
+         ret = EGL_FALSE;
+      }
+   }
    dri2_egl_unref_sync(dri2_dpy, dri2_sync);
-   return EGL_TRUE;
+
+   return ret;
 }
 
 static EGLint
@@ -2471,10 +2531,16 @@ dri2_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
                       EGLint flags, EGLTime timeout)
 {
    _EGLContext *ctx = _eglGetCurrentContext();
+   struct dri2_egl_driver *dri2_drv = dri2_egl_driver(drv);
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy);
    struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
    struct dri2_egl_sync *dri2_sync = dri2_egl_sync(sync);
    unsigned wait_flags = 0;
+
+   /* timespecs for cnd_timedwait */
+   struct timespec current;
+   xtime expire;
+
    EGLint ret = EGL_CONDITION_SATISFIED_KHR;
 
    /* The EGL_KHR_fence_sync spec states:
@@ -2488,17 +2554,130 @@ dri2_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
    /* the sync object should take a reference while waiting */
    dri2_egl_ref_sync(dri2_sync);
 
-   if (dri2_dpy->fence->client_wait_sync(dri2_ctx ? dri2_ctx->dri_context : NULL,
+   switch (sync->Type) {
+   case EGL_SYNC_FENCE_KHR:
+   case EGL_SYNC_CL_EVENT_KHR:
+      if (dri2_dpy->fence->client_wait_sync(dri2_ctx ? dri2_ctx->dri_context : NULL,
                                          dri2_sync->fence, wait_flags,
                                          timeout))
-      dri2_sync->base.SyncStatus = EGL_SIGNALED_KHR;
-   else
-      ret = EGL_TIMEOUT_EXPIRED_KHR;
+         dri2_sync->base.SyncStatus = EGL_SIGNALED_KHR;
+      else
+         ret = EGL_TIMEOUT_EXPIRED_KHR;
+      break;
 
+   case EGL_SYNC_REUSABLE_KHR:
+      if (dri2_ctx && dri2_sync->base.SyncStatus == EGL_UNSIGNALED_KHR &&
+          (flags & EGL_SYNC_FLUSH_COMMANDS_BIT_KHR)) {
+         /* flush context if EGL_SYNC_FLUSH_COMMANDS_BIT_KHR is set */
+         if (dri2_drv->glFlush)
+            dri2_drv->glFlush();
+      }
+
+      /* if timeout is EGL_FOREVER_KHR, it should wait without any timeout.*/
+      if (timeout == EGL_FOREVER_KHR) {
+         if (mtx_lock(&dri2_sync->mutex)) {
+            ret = EGL_FALSE;
+            goto cleanup;
+         }
+
+         ret = cnd_wait(&dri2_sync->cond, &dri2_sync->mutex);
+
+         if (mtx_unlock(&dri2_sync->mutex)) {
+            ret = EGL_FALSE;
+            goto cleanup;
+         }
+
+         if (ret) {
+            _eglError(EGL_BAD_PARAMETER, "eglClientWaitSyncKHR");
+            ret = EGL_FALSE;
+         }
+      } else {
+         /* if reusable sync has not been yet signaled */
+         if (dri2_sync->base.SyncStatus != EGL_SIGNALED_KHR) {
+            clock_gettime(CLOCK_MONOTONIC, &current);
+
+            /* calculating when to expire */
+            expire.nsec = timeout % 1000000000L;
+            expire.sec = timeout / 1000000000L;
+
+            expire.nsec += current.tv_nsec;
+            expire.sec += current.tv_sec;
+
+            /* expire.nsec now is a number between 0 and 1999999998 */
+            if (expire.nsec > 999999999L) {
+               expire.sec++;
+               expire.nsec -= 1000000000L;
+            }
+
+            if (mtx_lock(&dri2_sync->mutex)) {
+               ret = EGL_FALSE;
+               goto cleanup;
+            }
+
+            ret = cnd_timedwait(&dri2_sync->cond, &dri2_sync->mutex, &expire);
+
+            if (mtx_unlock(&dri2_sync->mutex)) {
+               ret = EGL_FALSE;
+               goto cleanup;
+            }
+
+            if (ret)
+               if (ret == thrd_busy) {
+                  if (dri2_sync->base.SyncStatus == EGL_UNSIGNALED_KHR) {
+                     ret = EGL_TIMEOUT_EXPIRED_KHR;
+                  } else {
+                     _eglError(EGL_BAD_ACCESS, "eglClientWaitSyncKHR");
+                     ret = EGL_FALSE;
+                  }
+               }
+         }
+      }
+      break;
+  }
+
+ cleanup:
    dri2_egl_unref_sync(dri2_dpy, dri2_sync);
+
+   if (ret == EGL_FALSE) {
+      _eglError(EGL_BAD_ACCESS, "eglClientWaitSyncKHR");
+      return EGL_FALSE;
+   }
+
    return ret;
 }
 
+static EGLBoolean
+dri2_signal_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                      EGLenum mode)
+{
+   struct dri2_egl_sync *dri2_sync = dri2_egl_sync(sync);
+   EGLint ret;
+
+   if (sync->Type != EGL_SYNC_REUSABLE_KHR) {
+      _eglError(EGL_BAD_MATCH, "eglSignalSyncKHR");
+      return EGL_FALSE;
+   }
+
+   if (mode != EGL_SIGNALED_KHR && mode != EGL_UNSIGNALED_KHR) {
+      _eglError(EGL_BAD_ATTRIBUTE, "eglSignalSyncKHR");
+      return EGL_FALSE;
+   }
+
+   dri2_sync->base.SyncStatus = mode;
+
+   if (mode == EGL_SIGNALED_KHR) {
+      ret = cnd_broadcast(&dri2_sync->cond);
+
+      /* fail to broadcast */
+      if (ret) {
+         _eglError(EGL_BAD_ACCESS, "eglSignalSyncKHR");
+         return EGL_FALSE;
+      }
+   }
+
+   return EGL_TRUE;
+}
+
 static EGLint
 dri2_server_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync)
 {
@@ -2620,6 +2799,7 @@ _eglBuiltInDriverDRI2(const char *args)
    dri2_drv->base.API.GetSyncValuesCHROMIUM = dri2_get_sync_values_chromium;
    dri2_drv->base.API.CreateSyncKHR = dri2_create_sync;
    dri2_drv->base.API.ClientWaitSyncKHR = dri2_client_wait_sync;
+   dri2_drv->base.API.SignalSyncKHR = dri2_signal_sync;
    dri2_drv->base.API.WaitSyncKHR = dri2_server_wait_sync;
    dri2_drv->base.API.DestroySyncKHR = dri2_destroy_sync;
 
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index 52ad92b182d..ef799398474 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -307,6 +307,8 @@ struct dri2_egl_image
 
 struct dri2_egl_sync {
    _EGLSync base;
+   mtx_t mutex;
+   cnd_t cond;
    int refcount;
    void *fence;
 };
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 8886759011a..64ffe92be43 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -1469,9 +1469,24 @@ eglClientWaitSync(EGLDisplay dpy, EGLSync sync, EGLint flags, EGLTime timeout)
    if (s->SyncStatus == EGL_SIGNALED_KHR)
       RETURN_EGL_EVAL(disp, EGL_CONDITION_SATISFIED_KHR);
 
+   /* if sync type is EGL_SYNC_REUSABLE_KHR, dpy should be
+    * unlocked here to allow other threads also to be able to
+    * go into waiting state.
+    */
+
+   if (s->Type == EGL_SYNC_REUSABLE_KHR)
+      _eglUnlockDisplay(dpy);
+
    ret = drv->API.ClientWaitSyncKHR(drv, disp, s, flags, timeout);
 
-   RETURN_EGL_EVAL(disp, ret);
+   /*
+    * 'disp' is already unlocked for reusable sync type,
+    * so passing 'NULL' to bypass unlocking display.
+    */
+   if (s->Type == EGL_SYNC_REUSABLE_KHR)
+      RETURN_EGL_EVAL(NULL, ret);
+   else
+      RETURN_EGL_EVAL(disp, ret);
 }
 
 
diff --git a/src/egl/main/eglsync.c b/src/egl/main/eglsync.c
index 999cb480c4b..33625e97ae3 100644
--- a/src/egl/main/eglsync.c
+++ b/src/egl/main/eglsync.c
@@ -152,7 +152,8 @@ _eglGetSyncAttrib(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
       /* update the sync status */
       if (sync->SyncStatus != EGL_SIGNALED_KHR &&
           (sync->Type == EGL_SYNC_FENCE_KHR ||
-           sync->Type == EGL_SYNC_CL_EVENT_KHR))
+           sync->Type == EGL_SYNC_CL_EVENT_KHR ||
+	   sync->Type == EGL_SYNC_REUSABLE_KHR))
          drv->API.ClientWaitSyncKHR(drv, dpy, sync, 0, 0);
 
       *value = sync->SyncStatus;

From 25f96d2b9766afa61a518667268e3134f43451e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 5 Apr 2016 15:32:16 +0200
Subject: [PATCH 16/72] docs/relnotes: document EGL_KHR_reusable_sync

---
 docs/relnotes/11.3.0.html | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index 6f8fcfbde29..d494b84d568 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -53,6 +53,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_OES_draw_buffers_indexed and GL_EXT_draw_buffers_indexed on all drivers that support GL_ARB_draw_buffers_blend</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
+<li>EGL_KHR_reusable_sync on all drivers</li>
 </ul>
 
 <h2>Bug fixes</h2>

From a64dbdf612805daff0bbc70bba26053bd226ae70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 29 Mar 2016 01:36:40 +0200
Subject: [PATCH 17/72] gallium/radeon: allow multiple exports of the same
 texture with different usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of failing an assertion, disable DCC and CMASK on the first export
that needs it, and merge the external usage flags.

v2: clear the EXPLICIT_FLUSH flag if it's not set; whitespace fixes

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_texture.c | 54 ++++++++++++++---------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 83fc0021227..4850b73f291 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -329,6 +329,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 	struct r600_resource *res = (struct r600_resource*)resource;
 	struct r600_texture *rtex = (struct r600_texture*)resource;
 	struct radeon_bo_metadata metadata;
+	bool update_metadata = false;
 
 	/* This is not supported now, but it might be required for OpenCL
 	 * interop in the future.
@@ -337,29 +338,30 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 	    (resource->nr_samples > 1 || rtex->is_depth))
 		return false;
 
-	if (!res->is_shared) {
-		res->is_shared = true;
-		res->external_usage = usage;
+	if (resource->target != PIPE_BUFFER) {
+		/* Since shader image stores don't support DCC on VI,
+		 * disable it for external clients that want write
+		 * access.
+		 */
+		if (usage & PIPE_HANDLE_USAGE_WRITE && rtex->dcc_offset) {
+			r600_texture_disable_dcc(rscreen, rtex);
+			update_metadata = true;
+		}
 
-		if (resource->target != PIPE_BUFFER) {
-			/* Since shader image stores don't support DCC on VI,
-			 * disable it for external clients that want write
-			 * access.
+		if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+		    rtex->cmask.size) {
+			/* Eliminate fast clear (both CMASK and DCC) */
+			r600_eliminate_fast_color_clear(rscreen, rtex);
+
+			/* Disable CMASK if flush_resource isn't going
+			 * to be called.
 			 */
-			if (usage & PIPE_HANDLE_USAGE_WRITE)
-				r600_texture_disable_dcc(rscreen, rtex);
+			r600_texture_disable_cmask(rscreen, rtex);
+			update_metadata = true;
+		}
 
-			if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) {
-				/* Eliminate fast clear (both CMASK and DCC) */
-				r600_eliminate_fast_color_clear(rscreen, rtex);
-
-				/* Disable CMASK if flush_resource isn't going
-				 * to be called.
-				 */
-				r600_texture_disable_cmask(rscreen, rtex);
-			}
-
-			/* Set metadata. */
+		/* Set metadata. */
+		if (!res->is_shared || update_metadata) {
 			r600_texture_init_metadata(rtex, &metadata);
 			if (rscreen->query_opaque_metadata)
 				rscreen->query_opaque_metadata(rscreen, rtex,
@@ -367,8 +369,18 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 
 			rscreen->ws->buffer_set_metadata(res->buf, &metadata);
 		}
+	}
+
+	if (res->is_shared) {
+		/* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
+		 * doesn't set it.
+		 */
+		res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+		if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+			res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
 	} else {
-		assert(res->external_usage == usage);
+		res->is_shared = true;
+		res->external_usage = usage;
 	}
 
 	return rscreen->ws->buffer_get_handle(res->buf,

From 713353db182dbf5d9be802aa2c1ec7d6debd07a9 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Sun, 3 Apr 2016 11:39:52 +0200
Subject: [PATCH 18/72] radeonsi: use bounded indexing for constant buffers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 56c575948ab..ca2ff4dfcae 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1328,8 +1328,9 @@ static LLVMValueRef fetch_constant(
 	if (reg->Register.Dimension && reg->Dimension.Indirect) {
 		LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 		LLVMValueRef index;
-		index = get_indirect_index(ctx, &reg->DimIndirect,
-						   reg->Dimension.Index);
+		index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
+						   reg->Dimension.Index,
+						   SI_NUM_USER_CONST_BUFFERS);
 		bufp = build_indexed_load_const(ctx, ptr, index);
 	} else
 		bufp = ctx->const_buffers[buf];

From 799789ba99f4bd27119cf46cc0e7f5384ec3d01e Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Sun, 3 Apr 2016 11:45:02 +0200
Subject: [PATCH 19/72] radeonsi: use bounded indexing for samplers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ca2ff4dfcae..bf3f00867e9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3357,7 +3357,10 @@ static void tex_fetch_ptrs(
 		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
 		LLVMValueRef ind_index;
 
-		ind_index = get_indirect_index(ctx, &reg->Indirect, reg->Register.Index);
+		ind_index = get_bounded_indirect_index(ctx,
+						       &reg->Indirect,
+						       reg->Register.Index,
+						       SI_NUM_USER_SAMPLERS);
 
 		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);
 

From 0daab9878d2b96356cf667591a2c877d912be52d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Tue, 5 Apr 2016 15:43:35 +0900
Subject: [PATCH 20/72] clover: Fix build against clang SVN >= r265359
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
---
 src/gallium/state_trackers/clover/llvm/invocation.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 4d11c2477c7..3fb35969a81 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -206,6 +206,9 @@ namespace {
       // http://www.llvm.org/bugs/show_bug.cgi?id=19735
       c.getDiagnosticOpts().ShowCarets = false;
       c.getInvocation().setLangDefaults(c.getLangOpts(), clang::IK_OpenCL,
+#if HAVE_LLVM >= 0x0309
+                                        llvm::Triple(triple),
+#endif
                                         clang::LangStandard::lang_opencl11);
       c.createDiagnostics(
                           new clang::TextDiagnosticPrinter(

From f9cdbf44054009122fcc16c887fb90ccc33b52c9 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 5 Apr 2016 12:39:47 -0400
Subject: [PATCH 21/72] freedreno/ir3: eliminate unnecessary absneg's

The frontend inserts (abs) and (neg)'s to convert between NIR boolean
(~0/0) and native boolean (1/0).  So we'd end up with things like:

   cmps.s.ge r1.x, ...
   absneg.s r1.x, (neg)r1.x
   absneg.s r1.x, (abs)r1.x
   sel.b32 r2.x, r0.x, r1.x, r0.y

The (neg) already gets collapsed due to the following (abs).  Now by
realizing that r1.x comes from a cmps.s instruction, we can drop the
(abs) as well.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.h    | 12 ++++++++++++
 src/gallium/drivers/freedreno/ir3/ir3_cp.c | 17 ++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 23e43b1e13d..3859f6a39f3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -628,6 +628,18 @@ static inline bool is_input(struct ir3_instruction *instr)
 	}
 }
 
+static inline bool is_bool(struct ir3_instruction *instr)
+{
+	switch (instr->opc) {
+	case OPC_CMPS_F:
+	case OPC_CMPS_S:
+	case OPC_CMPS_U:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static inline bool is_meta(struct ir3_instruction *instr)
 {
 	/* TODO how should we count PHI (and maybe fan-in/out) which
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index f032f0bd53f..6037becf22f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -189,8 +189,10 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
 /* propagate register flags from src to dst.. negates need special
  * handling to cancel each other out.
  */
-static void combine_flags(unsigned *dstflags, unsigned srcflags)
+static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
 {
+	unsigned srcflags = src->regs[1]->flags;
+
 	/* if what we are combining into already has (abs) flags,
 	 * we can drop (neg) from src:
 	 */
@@ -216,6 +218,15 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags)
 	*dstflags |= srcflags & IR3_REG_IMMED;
 	*dstflags |= srcflags & IR3_REG_RELATIV;
 	*dstflags |= srcflags & IR3_REG_ARRAY;
+
+	/* if src of the src is boolean we can drop the (abs) since we know
+	 * the source value is already a postitive integer.  This cleans
+	 * up the absnegs that get inserted when converting between nir and
+	 * native boolean (see ir3_b2n/n2b)
+	 */
+	struct ir3_instruction *srcsrc = ssa(src->regs[1]);
+	if (srcsrc && is_bool(srcsrc))
+		*dstflags &= ~IR3_REG_SABS;
 }
 
 /**
@@ -241,7 +252,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 		struct ir3_register *src_reg = src->regs[1];
 		unsigned new_flags = reg->flags;
 
-		combine_flags(&new_flags, src_reg->flags);
+		combine_flags(&new_flags, src);
 
 		if (valid_flags(instr, n, new_flags)) {
 			if (new_flags & IR3_REG_ARRAY) {
@@ -262,7 +273,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 		struct ir3_register *src_reg = src->regs[1];
 		unsigned new_flags = reg->flags;
 
-		combine_flags(&new_flags, src_reg->flags);
+		combine_flags(&new_flags, src);
 
 		if (!valid_flags(instr, n, new_flags)) {
 			/* special case for "normal" mad instructions, we can

From 506b561ba7e3df2a7759dded684fae84bf459f65 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 5 Apr 2016 13:45:34 -0400
Subject: [PATCH 22/72] freedreno/ir3: insert extra move into phi

We had an implicit assumption that the phi src was assigned in its
source (pred) block leading into the phi.  But this is not true with
NIR, so we can't just ignore the source block specified in the
nir_phi_src.  Insert an extra mov in the source block.  If it is not
required the CP pass will take it back out again.

Fixes:

  ./tests/spec/glsl-1.10/execution/vs-call-in-nested-loop.shader_test
  ./tests/spec/glsl-1.10/execution/vs-inner-loop-modifies-outer-loop-var.shader_test

and probably others.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 3f14412998c..245b61f31e5 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1661,6 +1661,16 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
 
 		foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
 			struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
+
+			/* NOTE: src might not be in the same block as it comes from
+			 * according to the phi.. but in the end the backend assumes
+			 * it will be able to assign the same register to each (which
+			 * only works if it is assigned in the src block), so insert
+			 * an extra mov to make sure the phi src is assigned in the
+			 * block it comes from:
+			 */
+			src = ir3_MOV(get_block(ctx, nsrc->pred), src, TYPE_U32);
+
 			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 		}
 	}

From f1293b2f9bc3a45c71941931edb5148d7b5f5a27 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sun, 3 Apr 2016 12:44:33 +1000
Subject: [PATCH 23/72] glsl: fully split apart buffer block arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this change we create the UBO and SSBO arrays separately from the
beginning rather than putting them into a combined array and splitting
it apart later.

A bug with UBO and SSBO stage reference querying is also fixed, as
we now use the block index to look up the references in the separate
arrays, not the combined buffer block array.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/compiler/glsl/link_uniform_blocks.cpp     | 206 +++++++++-------
 .../glsl/link_uniform_initializers.cpp        |  17 +-
 src/compiler/glsl/link_uniforms.cpp           |  32 +--
 src/compiler/glsl/linker.cpp                  | 225 ++++++++----------
 src/compiler/glsl/linker.h                    |   7 +-
 src/compiler/glsl/standalone_scaffolding.cpp  |   4 -
 src/mesa/main/mtypes.h                        |  50 +---
 src/mesa/main/shader_query.cpp                |   7 +-
 src/mesa/main/shaderapi.c                     |   2 +-
 src/mesa/main/shaderobj.c                     |  10 +-
 src/mesa/main/uniforms.c                      |   8 +-
 11 files changed, 271 insertions(+), 297 deletions(-)

diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp
index c8fa181a15d..586363d4f12 100644
--- a/src/compiler/glsl/link_uniform_blocks.cpp
+++ b/src/compiler/glsl/link_uniform_blocks.cpp
@@ -291,13 +291,105 @@ resize_block_array(const glsl_type *type,
    }
 }
 
-unsigned
+static void
+create_buffer_blocks(void *mem_ctx, struct gl_context *ctx,
+                     struct gl_shader_program *prog,
+                     struct gl_uniform_block **out_blks, unsigned num_blocks,
+                     struct hash_table *block_hash, unsigned num_variables,
+                     bool create_ubo_blocks)
+{
+   if (num_blocks == 0) {
+      assert(num_variables == 0);
+      return;
+   }
+
+   assert(num_variables != 0);
+
+   /* Allocate storage to hold all of the information related to uniform
+    * blocks that can be queried through the API.
+    */
+   struct gl_uniform_block *blocks = ralloc_array(mem_ctx, gl_uniform_block, num_blocks);
+   gl_uniform_buffer_variable *variables =
+      ralloc_array(blocks, gl_uniform_buffer_variable, num_variables);
+
+   /* Add each variable from each uniform block to the API tracking
+    * structures.
+    */
+   ubo_visitor parcel(blocks, variables, num_variables);
+
+   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_STD140)
+                 == unsigned(ubo_packing_std140));
+   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_SHARED)
+                 == unsigned(ubo_packing_shared));
+   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_PACKED)
+                 == unsigned(ubo_packing_packed));
+   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_STD430)
+                 == unsigned(ubo_packing_std430));
+
+   unsigned i = 0;
+   struct hash_entry *entry;
+   hash_table_foreach (block_hash, entry) {
+      const struct link_uniform_block_active *const b =
+         (const struct link_uniform_block_active *) entry->data;
+      const glsl_type *block_type = b->type;
+
+      if ((create_ubo_blocks && !b->is_shader_storage) ||
+          (!create_ubo_blocks && b->is_shader_storage)) {
+
+         if (b->array != NULL) {
+            unsigned binding_offset = 0;
+            char *name = ralloc_strdup(NULL,
+                                       block_type->without_array()->name);
+            size_t name_length = strlen(name);
+
+            assert(b->has_instance_name);
+            process_block_array(b->array, &name, name_length, blocks, &parcel,
+                                variables, b, &i, &binding_offset, ctx, prog);
+            ralloc_free(name);
+         } else {
+            blocks[i].Name = ralloc_strdup(blocks, block_type->name);
+            blocks[i].Uniforms = &variables[parcel.index];
+            blocks[i].Binding = (b->has_binding) ? b->binding : 0;
+            blocks[i].UniformBufferSize = 0;
+            blocks[i]._Packing =
+               gl_uniform_block_packing(block_type->interface_packing);
+
+            parcel.process(block_type,
+                           b->has_instance_name ? block_type->name : "");
+
+            blocks[i].UniformBufferSize = parcel.buffer_size;
+
+            /* Check SSBO size is lower than maximum supported size for SSBO
+             */
+            if (b->is_shader_storage &&
+                parcel.buffer_size > ctx->Const.MaxShaderStorageBlockSize) {
+               linker_error(prog, "shader storage block `%s' has size %d, "
+                            "which is larger than than the maximum allowed (%d)",
+                            block_type->name, parcel.buffer_size,
+                            ctx->Const.MaxShaderStorageBlockSize);
+            }
+            blocks[i].NumUniforms = (unsigned)(ptrdiff_t)
+               (&variables[parcel.index] - blocks[i].Uniforms);
+            i++;
+         }
+      }
+   }
+
+   *out_blks = blocks;
+
+   assert(parcel.index == num_variables);
+}
+
+void
 link_uniform_blocks(void *mem_ctx,
                     struct gl_context *ctx,
                     struct gl_shader_program *prog,
                     struct gl_shader **shader_list,
                     unsigned num_shaders,
-                    struct gl_uniform_block **blocks_ret)
+                    struct gl_uniform_block **ubo_blocks,
+                    unsigned *num_ubo_blocks,
+                    struct gl_uniform_block **ssbo_blocks,
+                    unsigned *num_ssbo_blocks)
 {
    /* This hash table will track all of the uniform blocks that have been
     * encountered.  Since blocks with the same block-name must be the same,
@@ -310,7 +402,7 @@ link_uniform_blocks(void *mem_ctx,
    if (block_hash == NULL) {
       _mesa_error_no_memory(__func__);
       linker_error(prog, "out of memory\n");
-      return 0;
+      return;
    }
 
    /* Determine which uniform blocks are active.
@@ -323,8 +415,8 @@ link_uniform_blocks(void *mem_ctx,
    /* Count the number of active uniform blocks.  Count the total number of
     * active slots in those uniform blocks.
     */
-   unsigned num_blocks = 0;
-   unsigned num_variables = 0;
+   unsigned num_ubo_variables = 0;
+   unsigned num_ssbo_variables = 0;
    count_block_size block_size;
    struct hash_entry *entry;
 
@@ -346,97 +438,31 @@ link_uniform_blocks(void *mem_ctx,
 
       if (b->array != NULL) {
          unsigned aoa_size = b->type->arrays_of_arrays_size();
-         num_blocks += aoa_size;
-         num_variables += aoa_size * block_size.num_active_uniforms;
-      } else {
-         num_blocks++;
-         num_variables += block_size.num_active_uniforms;
-      }
-
-   }
-
-   if (num_blocks == 0) {
-      assert(num_variables == 0);
-      _mesa_hash_table_destroy(block_hash, NULL);
-      return 0;
-   }
-
-   assert(num_variables != 0);
-
-   /* Allocate storage to hold all of the informatation related to uniform
-    * blocks that can be queried through the API.
-    */
-   gl_uniform_block *blocks =
-      ralloc_array(mem_ctx, gl_uniform_block, num_blocks);
-   gl_uniform_buffer_variable *variables =
-      ralloc_array(blocks, gl_uniform_buffer_variable, num_variables);
-
-   /* Add each variable from each uniform block to the API tracking
-    * structures.
-    */
-   unsigned i = 0;
-   ubo_visitor parcel(blocks, variables, num_variables);
-
-   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_STD140)
-                 == unsigned(ubo_packing_std140));
-   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_SHARED)
-                 == unsigned(ubo_packing_shared));
-   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_PACKED)
-                 == unsigned(ubo_packing_packed));
-   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_STD430)
-                 == unsigned(ubo_packing_std430));
-
-   hash_table_foreach (block_hash, entry) {
-      const struct link_uniform_block_active *const b =
-         (const struct link_uniform_block_active *) entry->data;
-      const glsl_type *block_type = b->type;
-
-      if (b->array != NULL) {
-         unsigned binding_offset = 0;
-         char *name = ralloc_strdup(NULL, block_type->without_array()->name);
-         size_t name_length = strlen(name);
-
-         assert(b->has_instance_name);
-         process_block_array(b->array, &name, name_length, blocks, &parcel,
-                             variables, b, &i, &binding_offset, ctx, prog);
-         ralloc_free(name);
-      } else {
-         blocks[i].Name = ralloc_strdup(blocks, block_type->name);
-         blocks[i].Uniforms = &variables[parcel.index];
-         blocks[i].Binding = (b->has_binding) ? b->binding : 0;
-         blocks[i].UniformBufferSize = 0;
-         blocks[i]._Packing =
-            gl_uniform_block_packing(block_type->interface_packing);
-
-         parcel.process(block_type,
-                        b->has_instance_name ? block_type->name : "");
-
-         blocks[i].UniformBufferSize = parcel.buffer_size;
-
-         /* Check SSBO size is lower than maximum supported size for SSBO */
-         if (b->is_shader_storage &&
-             parcel.buffer_size > ctx->Const.MaxShaderStorageBlockSize) {
-            linker_error(prog, "shader storage block `%s' has size %d, "
-                         "which is larger than than the maximum allowed (%d)",
-                         block_type->name,
-                         parcel.buffer_size,
-                         ctx->Const.MaxShaderStorageBlockSize);
+         if (b->is_shader_storage) {
+            *num_ssbo_blocks += aoa_size;
+            num_ssbo_variables += aoa_size * block_size.num_active_uniforms;
+         } else {
+            *num_ubo_blocks += aoa_size;
+            num_ubo_variables += aoa_size * block_size.num_active_uniforms;
+         }
+      } else {
+         if (b->is_shader_storage) {
+            (*num_ssbo_blocks)++;
+            num_ssbo_variables += block_size.num_active_uniforms;
+         } else {
+            (*num_ubo_blocks)++;
+            num_ubo_variables += block_size.num_active_uniforms;
          }
-         blocks[i].NumUniforms =
-            (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms);
-
-         blocks[i].IsShaderStorage = b->is_shader_storage;
-
-         i++;
       }
+
    }
 
-   assert(parcel.index == num_variables);
+   create_buffer_blocks(mem_ctx, ctx, prog, ubo_blocks, *num_ubo_blocks,
+                        block_hash, num_ubo_variables, true);
+   create_buffer_blocks(mem_ctx, ctx, prog, ssbo_blocks, *num_ssbo_blocks,
+                        block_hash, num_ssbo_variables, false);
 
    _mesa_hash_table_destroy(block_hash, NULL);
-
-   *blocks_ret = blocks;
-   return num_blocks;
 }
 
 bool
diff --git a/src/compiler/glsl/link_uniform_initializers.cpp b/src/compiler/glsl/link_uniform_initializers.cpp
index e5edf2e72e4..c6346d573ab 100644
--- a/src/compiler/glsl/link_uniform_initializers.cpp
+++ b/src/compiler/glsl/link_uniform_initializers.cpp
@@ -154,11 +154,17 @@ set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
 }
 
 void
-set_block_binding(gl_shader_program *prog, const char *block_name, int binding)
+set_block_binding(gl_shader_program *prog, const char *block_name,
+                  unsigned mode, int binding)
 {
-   for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) {
-      if (!strcmp(prog->BufferInterfaceBlocks[i].Name, block_name)) {
-         prog->BufferInterfaceBlocks[i].Binding = binding;
+   unsigned num_blocks = mode == ir_var_uniform ? prog->NumUniformBlocks :
+      prog->NumShaderStorageBlocks;
+   struct gl_uniform_block *blks = mode == ir_var_uniform ?
+      prog->UniformBlocks : prog->ShaderStorageBlocks;
+
+   for (unsigned i = 0; i < num_blocks; i++) {
+      if (!strcmp(blks[i].Name, block_name)) {
+         blks[i].Binding = binding;
          return;
       }
    }
@@ -308,11 +314,12 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
                       *     each subsequent element takes the next consecutive
                       *     uniform block binding point."
                       */
-                     linker::set_block_binding(prog, name,
+                     linker::set_block_binding(prog, name, var->data.mode,
                                                var->data.binding + i);
                   }
                } else {
                   linker::set_block_binding(prog, iface_type->name,
+                                            var->data.mode,
                                             var->data.binding);
                }
             } else if (type->contains_atomic()) {
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 7d8a4b4fb79..8db60a36f16 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -462,7 +462,7 @@ public:
 
       buffer_block_index = -1;
       if (var->is_in_buffer_block()) {
-         struct gl_uniform_block **blks = var->is_in_shader_storage_block() ?
+         struct gl_uniform_block *blks = var->is_in_shader_storage_block() ?
             prog->ShaderStorageBlocks : prog->UniformBlocks;
          unsigned num_blks = var->is_in_shader_storage_block() ?
             prog->NumShaderStorageBlocks : prog->NumUniformBlocks;
@@ -471,15 +471,15 @@ public:
             unsigned l = strlen(var->get_interface_type()->name);
 
             for (unsigned i = 0; i < num_blks; i++) {
-               if (strncmp(var->get_interface_type()->name, blks[i]->Name, l)
-                   == 0 && blks[i]->Name[l] == '[') {
+               if (strncmp(var->get_interface_type()->name, blks[i].Name, l)
+                   == 0 && blks[i].Name[l] == '[') {
                   buffer_block_index = i;
                   break;
                }
             }
          } else {
             for (unsigned i = 0; i < num_blks; i++) {
-               if (strcmp(var->get_interface_type()->name, blks[i]->Name) ==
+               if (strcmp(var->get_interface_type()->name, blks[i].Name) ==
                    0) {
                   buffer_block_index = i;
                   break;
@@ -500,7 +500,7 @@ public:
                     var->get_interface_type()->name);
          } else {
             const struct gl_uniform_block *const block =
-               blks[buffer_block_index];
+               &blks[buffer_block_index];
 
             assert(var->data.location != -1);
 
@@ -960,11 +960,16 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
          sentinel = '[';
       }
 
+      unsigned num_blocks = var->data.mode == ir_var_uniform ?
+         shader->NumUniformBlocks : shader->NumShaderStorageBlocks;
+      struct gl_uniform_block **blks = var->data.mode == ir_var_uniform ?
+         shader->UniformBlocks : shader->ShaderStorageBlocks;
+
       const unsigned l = strlen(var->name);
-      for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) {
-         for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i]->NumUniforms; j++) {
+      for (unsigned i = 0; i < num_blocks; i++) {
+         for (unsigned j = 0; j < blks[i]->NumUniforms; j++) {
             if (sentinel) {
-               const char *begin = shader->BufferInterfaceBlocks[i]->Uniforms[j].Name;
+               const char *begin = blks[i]->Uniforms[j].Name;
                const char *end = strchr(begin, sentinel);
 
                if (end == NULL)
@@ -978,8 +983,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
                   var->data.location = j;
                   break;
                }
-            } else if (!strcmp(var->name,
-                               shader->BufferInterfaceBlocks[i]->Uniforms[j].Name)) {
+            } else if (!strcmp(var->name, blks[i]->Uniforms[j].Name)) {
                found = true;
                var->data.location = j;
                break;
@@ -1104,11 +1108,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       sh->num_uniform_components = uniform_size.num_shader_uniform_components;
       sh->num_combined_uniform_components = sh->num_uniform_components;
 
-      for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) {
-         if (!sh->BufferInterfaceBlocks[i]->IsShaderStorage) {
-            sh->num_combined_uniform_components +=
-               sh->BufferInterfaceBlocks[i]->UniformBufferSize / 4;
-         }
+      for (unsigned i = 0; i < sh->NumUniformBlocks; i++) {
+         sh->num_combined_uniform_components +=
+            sh->UniformBlocks[i]->UniformBufferSize / 4;
       }
    }
 
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index d9a681ccca1..957efe5b55d 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -1165,39 +1165,58 @@ cross_validate_uniforms(struct gl_shader_program *prog)
 }
 
 /**
- * Accumulates the array of prog->BufferInterfaceBlocks and checks that all
- * definitons of blocks agree on their contents.
+ * Accumulates the array of buffer blocks and checks that all definitions of
+ * blocks agree on their contents.
  */
 static bool
-interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
+interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog,
+                                         bool validate_ssbo)
 {
    int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
+   struct gl_uniform_block *blks = NULL;
+   unsigned *num_blks = validate_ssbo ? &prog->NumShaderStorageBlocks :
+      &prog->NumUniformBlocks;
 
-   unsigned max_num_uniform_blocks = 0;
+   unsigned max_num_buffer_blocks = 0;
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
-      if (prog->_LinkedShaders[i])
-	 max_num_uniform_blocks += prog->_LinkedShaders[i]->NumBufferInterfaceBlocks;
+      if (prog->_LinkedShaders[i]) {
+         if (validate_ssbo) {
+            max_num_buffer_blocks +=
+               prog->_LinkedShaders[i]->NumShaderStorageBlocks;
+         } else {
+            max_num_buffer_blocks +=
+               prog->_LinkedShaders[i]->NumUniformBlocks;
+         }
+      }
    }
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       struct gl_shader *sh = prog->_LinkedShaders[i];
 
-      InterfaceBlockStageIndex[i] = new int[max_num_uniform_blocks];
-      for (unsigned int j = 0; j < max_num_uniform_blocks; j++)
+      InterfaceBlockStageIndex[i] = new int[max_num_buffer_blocks];
+      for (unsigned int j = 0; j < max_num_buffer_blocks; j++)
          InterfaceBlockStageIndex[i][j] = -1;
 
       if (sh == NULL)
 	 continue;
 
-      for (unsigned int j = 0; j < sh->NumBufferInterfaceBlocks; j++) {
-	 int index = link_cross_validate_uniform_block(prog,
-						       &prog->BufferInterfaceBlocks,
-						       &prog->NumBufferInterfaceBlocks,
-						       sh->BufferInterfaceBlocks[j]);
+      unsigned sh_num_blocks;
+      struct gl_uniform_block **sh_blks;
+      if (validate_ssbo) {
+         sh_num_blocks = prog->_LinkedShaders[i]->NumShaderStorageBlocks;
+         sh_blks = sh->ShaderStorageBlocks;
+      } else {
+         sh_num_blocks = prog->_LinkedShaders[i]->NumUniformBlocks;
+         sh_blks = sh->UniformBlocks;
+      }
+
+      for (unsigned int j = 0; j < sh_num_blocks; j++) {
+         int index = link_cross_validate_uniform_block(prog, &blks, num_blks,
+                                                       sh_blks[j]);
 
          if (index == -1) {
-            linker_error(prog, "uniform block `%s' has mismatching definitions\n",
-                         sh->BufferInterfaceBlocks[j]->Name);
+            linker_error(prog, "buffer block `%s' has mismatching "
+                         "definitions\n", sh_blks[j]->Name);
 
             for (unsigned k = 0; k <= i; k++) {
                delete[] InterfaceBlockStageIndex[k];
@@ -1213,16 +1232,18 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
     * FIXME: We should be able to free the per stage blocks here.
     */
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
-      for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
+      for (unsigned j = 0; j < *num_blks; j++) {
          int stage_index = InterfaceBlockStageIndex[i][j];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = prog->_LinkedShaders[i];
 
-            prog->BufferInterfaceBlocks[j].stageref |= (1 << i);
+            blks[j].stageref |= (1 << i);
 
-            sh->BufferInterfaceBlocks[stage_index] =
-               &prog->BufferInterfaceBlocks[j];
+            struct gl_uniform_block **sh_blks = validate_ssbo ?
+               sh->ShaderStorageBlocks : sh->UniformBlocks;
+
+            sh_blks[stage_index] = &blks[j];
 	 }
       }
    }
@@ -1231,6 +1252,11 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
       delete[] InterfaceBlockStageIndex[i];
    }
 
+   if (validate_ssbo)
+      prog->ShaderStorageBlocks = blks;
+   else
+      prog->UniformBlocks = blks;
+
    return true;
 }
 
@@ -2074,7 +2100,10 @@ link_intrastage_shaders(void *mem_ctx,
 			struct gl_shader **shader_list,
 			unsigned num_shaders)
 {
-   struct gl_uniform_block *uniform_blocks = NULL;
+   struct gl_uniform_block *ubo_blocks = NULL;
+   struct gl_uniform_block *ssbo_blocks = NULL;
+   unsigned num_ubo_blocks = 0;
+   unsigned num_ssbo_blocks = 0;
 
    /* Check that global variables defined in multiple shaders are consistent.
     */
@@ -2090,9 +2119,10 @@ link_intrastage_shaders(void *mem_ctx,
       return NULL;
 
    /* Link up uniform blocks defined within this stage. */
-   const unsigned num_uniform_blocks =
-      link_uniform_blocks(mem_ctx, ctx, prog, shader_list, num_shaders,
-                          &uniform_blocks);
+   link_uniform_blocks(mem_ctx, ctx, prog, shader_list, num_shaders,
+                       &ubo_blocks, &num_ubo_blocks, &ssbo_blocks,
+                       &num_ssbo_blocks);
+
    if (!prog->LinkStatus)
       return NULL;
 
@@ -2159,15 +2189,23 @@ link_intrastage_shaders(void *mem_ctx,
    linked->ir = new(linked) exec_list;
    clone_ir_list(mem_ctx, linked->ir, main->ir);
 
-   linked->BufferInterfaceBlocks =
-      ralloc_array(linked, gl_uniform_block *, num_uniform_blocks);
-
-   ralloc_steal(linked, uniform_blocks);
-   for (unsigned i = 0; i < num_uniform_blocks; i++) {
-      linked->BufferInterfaceBlocks[i] = &uniform_blocks[i];
+   /* Copy ubo blocks to linked shader list */
+   linked->UniformBlocks =
+      ralloc_array(linked, gl_uniform_block *, num_ubo_blocks);
+   ralloc_steal(linked, ubo_blocks);
+   for (unsigned i = 0; i < num_ubo_blocks; i++) {
+      linked->UniformBlocks[i] = &ubo_blocks[i];
    }
+   linked->NumUniformBlocks = num_ubo_blocks;
 
-   linked->NumBufferInterfaceBlocks = num_uniform_blocks;
+   /* Copy ssbo blocks to linked shader list */
+   linked->ShaderStorageBlocks =
+      ralloc_array(linked, gl_uniform_block *, num_ssbo_blocks);
+   ralloc_steal(linked, ssbo_blocks);
+   for (unsigned i = 0; i < num_ssbo_blocks; i++) {
+      linked->ShaderStorageBlocks[i] = &ssbo_blocks[i];
+   }
+   linked->NumShaderStorageBlocks = num_ssbo_blocks;
 
    link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
@@ -2973,21 +3011,22 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
                    ctx->Const.MaxCombinedShaderStorageBlocks);
    }
 
-   for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) {
-      /* Don't check SSBOs for Uniform Block Size */
-      if (!prog->BufferInterfaceBlocks[i].IsShaderStorage &&
-          prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) {
+   for (unsigned i = 0; i < prog->NumUniformBlocks; i++) {
+      if (prog->UniformBlocks[i].UniformBufferSize >
+          ctx->Const.MaxUniformBlockSize) {
          linker_error(prog, "Uniform block %s too big (%d/%d)\n",
-                      prog->BufferInterfaceBlocks[i].Name,
-                      prog->BufferInterfaceBlocks[i].UniformBufferSize,
+                      prog->UniformBlocks[i].Name,
+                      prog->UniformBlocks[i].UniformBufferSize,
                       ctx->Const.MaxUniformBlockSize);
       }
+   }
 
-      if (prog->BufferInterfaceBlocks[i].IsShaderStorage &&
-          prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxShaderStorageBlockSize) {
+   for (unsigned i = 0; i < prog->NumShaderStorageBlocks; i++) {
+      if (prog->ShaderStorageBlocks[i].UniformBufferSize >
+          ctx->Const.MaxShaderStorageBlockSize) {
          linker_error(prog, "Shader storage block %s too big (%d/%d)\n",
-                      prog->BufferInterfaceBlocks[i].Name,
-                      prog->BufferInterfaceBlocks[i].UniformBufferSize,
+                      prog->ShaderStorageBlocks[i].Name,
+                      prog->ShaderStorageBlocks[i].UniformBufferSize,
                       ctx->Const.MaxShaderStorageBlockSize);
       }
    }
@@ -3295,8 +3334,8 @@ should_add_buffer_variable(struct gl_shader_program *shProg,
    if (type != GL_BUFFER_VARIABLE)
       return true;
 
-   for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
-      const char *block_name = shProg->BufferInterfaceBlocks[i].Name;
+   for (unsigned i = 0; i < shProg->NumShaderStorageBlocks; i++) {
+      const char *block_name = shProg->ShaderStorageBlocks[i].Name;
       block_name_len = strlen(block_name);
 
       const char *block_square_bracket = strchr(block_name, '[');
@@ -3805,8 +3844,8 @@ calculate_array_size_and_stride(struct gl_shader_program *shProg,
    char *var_name = get_top_level_name(uni->name);
    char *interface_name =
       get_top_level_name(uni->is_shader_storage ?
-                         shProg->ShaderStorageBlocks[block_index]->Name :
-                         shProg->UniformBlocks[block_index]->Name);
+                         shProg->ShaderStorageBlocks[block_index].Name :
+                         shProg->UniformBlocks[block_index].Name);
 
    if (strcmp(var_name, interface_name) == 0) {
       /* Deal with instanced array of SSBOs */
@@ -3947,8 +3986,8 @@ build_program_resource_list(struct gl_context *ctx,
       int block_index = shProg->UniformStorage[i].block_index;
       if (block_index != -1) {
          stageref |= is_shader_storage ?
-            shProg->ShaderStorageBlocks[block_index]->stageref :
-            shProg->UniformBlocks[block_index]->stageref;
+            shProg->ShaderStorageBlocks[block_index].stageref :
+            shProg->UniformBlocks[block_index].stageref;
       }
 
       GLenum type = is_shader_storage ? GL_BUFFER_VARIABLE : GL_UNIFORM;
@@ -3965,12 +4004,17 @@ build_program_resource_list(struct gl_context *ctx,
          return;
    }
 
-   /* Add program uniform blocks and shader storage blocks. */
-   for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
-      bool is_shader_storage = shProg->BufferInterfaceBlocks[i].IsShaderStorage;
-      GLenum type = is_shader_storage ? GL_SHADER_STORAGE_BLOCK : GL_UNIFORM_BLOCK;
-      if (!add_program_resource(shProg, type,
-          &shProg->BufferInterfaceBlocks[i], 0))
+   /* Add program uniform blocks. */
+   for (unsigned i = 0; i < shProg->NumUniformBlocks; i++) {
+      if (!add_program_resource(shProg, GL_UNIFORM_BLOCK,
+          &shProg->UniformBlocks[i], 0))
+         return;
+   }
+
+   /* Add program shader storage blocks. */
+   for (unsigned i = 0; i < shProg->NumShaderStorageBlocks; i++) {
+      if (!add_program_resource(shProg, GL_SHADER_STORAGE_BLOCK,
+          &shProg->ShaderStorageBlocks[i], 0))
          return;
    }
 
@@ -4115,49 +4159,6 @@ link_assign_subroutine_types(struct gl_shader_program *prog)
    }
 }
 
-static void
-split_ubos_and_ssbos(void *mem_ctx,
-                     struct gl_uniform_block **s_blks,
-                     struct gl_uniform_block *p_blks,
-                     unsigned num_blocks,
-                     struct gl_uniform_block ***ubos,
-                     unsigned *num_ubos,
-                     struct gl_uniform_block ***ssbos,
-                     unsigned *num_ssbos)
-{
-   unsigned num_ubo_blocks = 0;
-   unsigned num_ssbo_blocks = 0;
-
-   /* Are we spliting the list of blocks for the shader or the program */
-   bool is_shader = p_blks == NULL;
-
-   for (unsigned i = 0; i < num_blocks; i++) {
-      if (is_shader ? s_blks[i]->IsShaderStorage : p_blks[i].IsShaderStorage)
-         num_ssbo_blocks++;
-      else
-         num_ubo_blocks++;
-   }
-
-   *ubos = ralloc_array(mem_ctx, gl_uniform_block *, num_ubo_blocks);
-   *num_ubos = 0;
-
-   *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
-   *num_ssbos = 0;
-
-   for (unsigned i = 0; i < num_blocks; i++) {
-      struct gl_uniform_block *blk = is_shader ? s_blks[i] : &p_blks[i];
-      if (blk->IsShaderStorage) {
-         (*ssbos)[*num_ssbos] = blk;
-         (*num_ssbos)++;
-      } else {
-         (*ubos)[*num_ubos] = blk;
-         (*num_ubos)++;
-      }
-   }
-
-   assert(*num_ubos + *num_ssbos == num_blocks);
-}
-
 static void
 set_always_active_io(exec_list *ir, ir_variable_mode io_mode)
 {
@@ -4498,7 +4499,12 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    if (prog->SeparateShader)
       disable_varying_optimizations_for_sso(prog);
 
-   if (!interstage_cross_validate_uniform_blocks(prog))
+   /* Process UBOs */
+   if (!interstage_cross_validate_uniform_blocks(prog, false))
+      goto done;
+
+   /* Process SSBOs */
+   if (!interstage_cross_validate_uniform_blocks(prog, true))
       goto done;
 
    /* Do common optimization before assigning storage for attributes,
@@ -4695,33 +4701,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                              has_xfb_qualifiers))
       goto done;
 
-   /* Split BufferInterfaceBlocks into UniformBlocks and ShaderStorageBlocks
-    * for gl_shader_program and gl_shader, so that drivers that need separate
-    * index spaces for each set can have that.
-    */
-   for (unsigned i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++) {
-      if (prog->_LinkedShaders[i] != NULL) {
-         gl_shader *sh = prog->_LinkedShaders[i];
-         split_ubos_and_ssbos(sh,
-                              sh->BufferInterfaceBlocks,
-                              NULL,
-                              sh->NumBufferInterfaceBlocks,
-                              &sh->UniformBlocks,
-                              &sh->NumUniformBlocks,
-                              &sh->ShaderStorageBlocks,
-                              &sh->NumShaderStorageBlocks);
-      }
-   }
-
-   split_ubos_and_ssbos(prog,
-                        NULL,
-                        prog->BufferInterfaceBlocks,
-                        prog->NumBufferInterfaceBlocks,
-                        &prog->UniformBlocks,
-                        &prog->NumUniformBlocks,
-                        &prog->ShaderStorageBlocks,
-                        &prog->NumShaderStorageBlocks);
-
    update_array_sizes(prog);
    link_assign_uniform_locations(prog, ctx->Const.UniformBooleanTrue,
                                  num_explicit_uniform_locs,
diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h
index 97144df8ff7..3a0ec8b35d3 100644
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -53,13 +53,16 @@ extern bool
 link_uniform_blocks_are_compatible(const gl_uniform_block *a,
 				   const gl_uniform_block *b);
 
-extern unsigned
+extern void
 link_uniform_blocks(void *mem_ctx,
                     struct gl_context *ctx,
                     struct gl_shader_program *prog,
                     struct gl_shader **shader_list,
                     unsigned num_shaders,
-                    struct gl_uniform_block **blocks_ret);
+                    struct gl_uniform_block **ubo_blocks,
+                    unsigned *num_ubo_blocks,
+                    struct gl_uniform_block **ssbo_blocks,
+                    unsigned *num_ssbo_blocks);
 
 bool
 validate_intrastage_arrays(struct gl_shader_program *prog,
diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp
index 49b4a26dc12..09d7d6e8c26 100644
--- a/src/compiler/glsl/standalone_scaffolding.cpp
+++ b/src/compiler/glsl/standalone_scaffolding.cpp
@@ -105,10 +105,6 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    ralloc_free(shProg->InfoLog);
    shProg->InfoLog = ralloc_strdup(shProg, "");
 
-   ralloc_free(shProg->BufferInterfaceBlocks);
-   shProg->BufferInterfaceBlocks = NULL;
-   shProg->NumBufferInterfaceBlocks = 0;
-
    ralloc_free(shProg->UniformBlocks);
    shProg->UniformBlocks = NULL;
    shProg->NumUniformBlocks = 0;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index b2060c282f4..dc73278424b 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2295,30 +2295,6 @@ struct gl_shader
     */
    unsigned num_combined_uniform_components;
 
-   /**
-    * This shader's uniform/ssbo block information.
-    *
-    * These fields are only set post-linking.
-    *
-    * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is
-    * useful during the linking process so that we don't have to handle SSBOs
-    * specifically.
-    *
-    * UniformBlocks is a list of UBOs. This is useful for backends that need
-    * or prefer to see separate index spaces for UBOS and SSBOs like the GL
-    * API specifies.
-    *
-    * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that
-    * need or prefer to see separate index spaces for UBOS and SSBOs like the
-    * GL API specifies.
-    *
-    * UniformBlocks and ShaderStorageBlocks only have pointers into
-    * BufferInterfaceBlocks so the actual resource information is not
-    * duplicated.
-    */
-   unsigned NumBufferInterfaceBlocks;
-   struct gl_uniform_block **BufferInterfaceBlocks;
-
    unsigned NumUniformBlocks;
    struct gl_uniform_block **UniformBlocks;
 
@@ -2804,33 +2780,11 @@ struct gl_shader_program
     */
    unsigned LastClipDistanceArraySize;
 
-   /**
-    * This shader's uniform/ssbo block information.
-    *
-    * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is
-    * useful during the linking process so that we don't have to handle SSBOs
-    * specifically.
-    *
-    * UniformBlocks is a list of UBOs. This is useful for backends that need
-    * or prefer to see separate index spaces for UBOS and SSBOs like the GL
-    * API specifies.
-    *
-    * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that
-    * need or prefer to see separate index spaces for UBOS and SSBOs like the
-    * GL API specifies.
-    *
-    * UniformBlocks and ShaderStorageBlocks only have pointers into
-    * BufferInterfaceBlocks so the actual resource information is not
-    * duplicated and are only set after linking.
-    */
-   unsigned NumBufferInterfaceBlocks;
-   struct gl_uniform_block *BufferInterfaceBlocks;
-
    unsigned NumUniformBlocks;
-   struct gl_uniform_block **UniformBlocks;
+   struct gl_uniform_block *UniformBlocks;
 
    unsigned NumShaderStorageBlocks;
-   struct gl_uniform_block **ShaderStorageBlocks;
+   struct gl_uniform_block *ShaderStorageBlocks;
 
    /**
     * Map of active uniform names to locations
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 4ef6a81204e..2c1a6ee3505 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -925,8 +925,11 @@ is_resource_referenced(struct gl_shader_program *shProg,
    if (res->Type == GL_ATOMIC_COUNTER_BUFFER)
       return RESOURCE_ATC(res)->StageReferences[stage];
 
-   if (res->Type == GL_UNIFORM_BLOCK || res->Type == GL_SHADER_STORAGE_BLOCK)
-      return shProg->BufferInterfaceBlocks[index].stageref & (1 << stage);
+   if (res->Type == GL_UNIFORM_BLOCK)
+      return shProg->UniformBlocks[index].stageref & (1 << stage);
+
+   if (res->Type == GL_SHADER_STORAGE_BLOCK)
+      return shProg->ShaderStorageBlocks[index].stageref & (1 << stage);
 
    return res->StageReferences & (1 << stage);
 }
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index ba2607221d9..b28b5ce5457 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -727,7 +727,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
       for (i = 0; i < shProg->NumUniformBlocks; i++) {
 	 /* Add one for the terminating NUL character.
 	  */
-	 const GLint len = strlen(shProg->UniformBlocks[i]->Name) + 1;
+	 const GLint len = strlen(shProg->UniformBlocks[i].Name) + 1;
 
 	 if (len > max_len)
 	    max_len = len;
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index 8b9166ceecb..274cb129b07 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -292,9 +292,13 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    ralloc_free(shProg->InfoLog);
    shProg->InfoLog = ralloc_strdup(shProg, "");
 
-   ralloc_free(shProg->BufferInterfaceBlocks);
-   shProg->BufferInterfaceBlocks = NULL;
-   shProg->NumBufferInterfaceBlocks = 0;
+   ralloc_free(shProg->UniformBlocks);
+   shProg->UniformBlocks = NULL;
+   shProg->NumUniformBlocks = 0;
+
+   ralloc_free(shProg->ShaderStorageBlocks);
+   shProg->ShaderStorageBlocks = NULL;
+   shProg->NumShaderStorageBlocks = 0;
 
    ralloc_free(shProg->AtomicBuffers);
    shProg->AtomicBuffers = NULL;
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 7dcbdccf442..a9308d09f69 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1016,13 +1016,13 @@ _mesa_UniformBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->UniformBlocks[uniformBlockIndex]->Binding !=
+   if (shProg->UniformBlocks[uniformBlockIndex].Binding !=
        uniformBlockBinding) {
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
 
-      shProg->UniformBlocks[uniformBlockIndex]->Binding = uniformBlockBinding;
+      shProg->UniformBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
    }
 }
 
@@ -1059,13 +1059,13 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding !=
+   if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex].Binding !=
        shaderStorageBlockBinding) {
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding =
+      shProg->ShaderStorageBlocks[shaderStorageBlockIndex].Binding =
          shaderStorageBlockBinding;
    }
 }

From 7ef57aa685012102b6f4d6235846f452efce4198 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sun, 3 Apr 2016 15:14:14 +1000
Subject: [PATCH 24/72] mesa: remove unused IsShaderStorage field
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/compiler/glsl/link_uniform_blocks.cpp | 1 -
 src/mesa/main/mtypes.h                    | 5 -----
 2 files changed, 6 deletions(-)

diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp
index 586363d4f12..48fc63fa926 100644
--- a/src/compiler/glsl/link_uniform_blocks.cpp
+++ b/src/compiler/glsl/link_uniform_blocks.cpp
@@ -261,7 +261,6 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name,
       }
       blocks[i].NumUniforms =
          (unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms);
-      blocks[i].IsShaderStorage = b->is_shader_storage;
 
       *block_index = *block_index + 1;
       *binding_offset = *binding_offset + 1;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index dc73278424b..36c6e201aae 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2500,11 +2500,6 @@ struct gl_uniform_block
     */
    GLuint UniformBufferSize;
 
-   /**
-    * Is this actually an interface block for a shader storage buffer?
-    */
-   bool IsShaderStorage;
-
    /** Stages that reference this block */
    uint8_t stageref;
 

From 5d39f0380639c80731ed0df2bcfd04f0095d8481 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sun, 3 Apr 2016 18:33:40 +1000
Subject: [PATCH 25/72] glsl: remove remaining tabs in link_uniform_blocks.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/compiler/glsl/link_uniform_blocks.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp
index 48fc63fa926..58f22fd61c5 100644
--- a/src/compiler/glsl/link_uniform_blocks.cpp
+++ b/src/compiler/glsl/link_uniform_blocks.cpp
@@ -216,7 +216,7 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name,
 {
    if (ub_array) {
       for (unsigned j = 0; j < ub_array->num_array_elements; j++) {
-	 size_t new_length = name_length;
+         size_t new_length = name_length;
 
          /* Append the subscript to the current variable name */
          ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]",
@@ -466,7 +466,7 @@ link_uniform_blocks(void *mem_ctx,
 
 bool
 link_uniform_blocks_are_compatible(const gl_uniform_block *a,
-				   const gl_uniform_block *b)
+                                   const gl_uniform_block *b)
 {
    assert(strcmp(a->Name, b->Name) == 0);
 
@@ -489,13 +489,13 @@ link_uniform_blocks_are_compatible(const gl_uniform_block *a,
 
    for (unsigned i = 0; i < a->NumUniforms; i++) {
       if (strcmp(a->Uniforms[i].Name, b->Uniforms[i].Name) != 0)
-	 return false;
+         return false;
 
       if (a->Uniforms[i].Type != b->Uniforms[i].Type)
-	 return false;
+         return false;
 
       if (a->Uniforms[i].RowMajor != b->Uniforms[i].RowMajor)
-	 return false;
+         return false;
    }
 
    return true;

From 2e123e1a25182c2e43bf8de1b7911fa6ffc00ae1 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 4 Apr 2016 11:54:22 -0400
Subject: [PATCH 26/72] glsl: use has_shader_storage_buffer_objects helper

Replaces open-coded logic with existing helper.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/glsl/lower_ubo_reference.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/compiler/glsl/lower_ubo_reference.cpp b/src/compiler/glsl/lower_ubo_reference.cpp
index 3155ab6225e..1a0140fad15 100644
--- a/src/compiler/glsl/lower_ubo_reference.cpp
+++ b/src/compiler/glsl/lower_ubo_reference.cpp
@@ -372,8 +372,7 @@ lower_ubo_reference_visitor::ubo_load(void *mem_ctx,
 static bool
 shader_storage_buffer_object(const _mesa_glsl_parse_state *state)
 {
-   return state->ARB_shader_storage_buffer_object_enable ||
-      state->is_version(430, 310);
+   return state->has_shader_storage_buffer_objects();
 }
 
 uint32_t

From 0293d72fa56b2cae664a5727abceddb8046d2e92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 31 Mar 2016 23:32:53 +0200
Subject: [PATCH 27/72] drirc: add a workaround for blackness in Warsow

Cc: 11.1 11.2 <mesa-stable@lists.freedesktop.org>
---
 src/mesa/drivers/dri/common/drirc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc
index 183a1dcabe8..3912d8b8c7e 100644
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -88,5 +88,13 @@ TODO: document the other workarounds.
         <application name="Second Life" executable="do-not-directly-run-secondlife-bin">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
         </application>
+
+        <application name="Warsow (32-bit)" executable="warsow.i386">
+            <option name="allow_glsl_extension_directive_midshader" value="true" />
+        </application>
+
+        <application name="Warsow (64-bit)" executable="warsow.x86_64">
+            <option name="allow_glsl_extension_directive_midshader" value="true" />
+        </application>
     </device>
 </driconf>

From 339335811580c522d6ff66878bc40e662739c47b Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Thu, 31 Mar 2016 11:58:26 +0200
Subject: [PATCH 28/72] radeonsi: set shader calling conventions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Note that combining an old Mesa with a new LLVM, or a new Mesa with an
old LLVM, breaks with this change and the corresponding LLVM change
(D18559).

For LLVM versions <= 3.8 we use the old method, but we cannot detect
people using a post-3.8 svn version of LLVM that is still too old.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeon/radeon_llvm_emit.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 474154e52ff..71741325af0 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -55,6 +55,13 @@ enum radeon_llvm_shader_type {
 	RADEON_LLVM_SHADER_CS = 3,
 };
 
+enum radeon_llvm_calling_convention {
+	RADEON_LLVM_AMDGPU_VS = 87,
+	RADEON_LLVM_AMDGPU_GS = 88,
+	RADEON_LLVM_AMDGPU_PS = 89,
+	RADEON_LLVM_AMDGPU_CS = 90,
+};
+
 void radeon_llvm_add_attribute(LLVMValueRef F, const char *name, int value)
 {
 	char str[16];
@@ -71,27 +78,35 @@ void radeon_llvm_add_attribute(LLVMValueRef F, const char *name, int value)
 void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
 {
 	enum radeon_llvm_shader_type llvm_type;
+	enum radeon_llvm_calling_convention calling_conv;
 
 	switch (type) {
 	case TGSI_PROCESSOR_VERTEX:
 	case TGSI_PROCESSOR_TESS_CTRL:
 	case TGSI_PROCESSOR_TESS_EVAL:
 		llvm_type = RADEON_LLVM_SHADER_VS;
+		calling_conv = RADEON_LLVM_AMDGPU_VS;
 		break;
 	case TGSI_PROCESSOR_GEOMETRY:
 		llvm_type = RADEON_LLVM_SHADER_GS;
+		calling_conv = RADEON_LLVM_AMDGPU_GS;
 		break;
 	case TGSI_PROCESSOR_FRAGMENT:
 		llvm_type = RADEON_LLVM_SHADER_PS;
+		calling_conv = RADEON_LLVM_AMDGPU_PS;
 		break;
 	case TGSI_PROCESSOR_COMPUTE:
 		llvm_type = RADEON_LLVM_SHADER_CS;
+		calling_conv = RADEON_LLVM_AMDGPU_CS;
 		break;
 	default:
 		assert(0);
 	}
 
-	radeon_llvm_add_attribute(F, "ShaderType", llvm_type);
+	if (HAVE_LLVM >= 0x309)
+		LLVMSetFunctionCallConv(F, calling_conv);
+	else
+		radeon_llvm_add_attribute(F, "ShaderType", llvm_type);
 }
 
 static void init_r600_target()

From 061969f9dd0dcaa2cd0c412fedc6ef159dcaf8b0 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 21 Oct 2015 20:40:28 -0700
Subject: [PATCH 29/72] i965: Move get_hw_prim_for_gl_prim to brw_util.c

It's used by brw_compile_gs in brw_vec4_gs_visitor.cpp so it needs to be in
a file that's linked into libi965_compiler.la.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_draw.c | 29 ----------------------------
 src/mesa/drivers/dri/i965/brw_util.c | 28 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index c295d91223c..afa8a4e9eae 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -54,23 +54,6 @@
 
 #define FILE_DEBUG_FLAG DEBUG_PRIMS
 
-static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
-   [GL_POINTS] =_3DPRIM_POINTLIST,
-   [GL_LINES] = _3DPRIM_LINELIST,
-   [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
-   [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
-   [GL_TRIANGLES] = _3DPRIM_TRILIST,
-   [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
-   [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
-   [GL_QUADS] = _3DPRIM_QUADLIST,
-   [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
-   [GL_POLYGON] = _3DPRIM_POLYGON,
-   [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
-   [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
-   [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
-   [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
-};
-
 
 static const GLenum reduced_prim[GL_POLYGON+1] = {
    [GL_POINTS] = GL_POINTS,
@@ -85,18 +68,6 @@ static const GLenum reduced_prim[GL_POLYGON+1] = {
    [GL_POLYGON] = GL_TRIANGLES
 };
 
-uint32_t
-get_hw_prim_for_gl_prim(int mode)
-{
-   if (mode >= BRW_PRIM_OFFSET)
-      return mode - BRW_PRIM_OFFSET;
-   else {
-      assert(mode < ARRAY_SIZE(prim_to_hw_prim));
-      return prim_to_hw_prim[mode];
-   }
-}
-
-
 /* When the primitive changes, set a state bit and re-validate.  Not
  * the nicest and would rather deal with this by having all the
  * programs be immune to the active primitive (ie. cope with all
diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c
index bf7f9c61c84..934b6b8d627 100644
--- a/src/mesa/drivers/dri/i965/brw_util.c
+++ b/src/mesa/drivers/dri/i965/brw_util.c
@@ -98,3 +98,31 @@ GLuint brw_translate_blend_factor( GLenum factor )
       unreachable("not reached");
    }
 }
+
+static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
+   [GL_POINTS] =_3DPRIM_POINTLIST,
+   [GL_LINES] = _3DPRIM_LINELIST,
+   [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
+   [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
+   [GL_TRIANGLES] = _3DPRIM_TRILIST,
+   [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
+   [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
+   [GL_QUADS] = _3DPRIM_QUADLIST,
+   [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
+   [GL_POLYGON] = _3DPRIM_POLYGON,
+   [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
+   [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
+   [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
+   [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+};
+
+uint32_t
+get_hw_prim_for_gl_prim(int mode)
+{
+   if (mode >= BRW_PRIM_OFFSET)
+      return mode - BRW_PRIM_OFFSET;
+   else {
+      assert(mode < ARRAY_SIZE(prim_to_hw_prim));
+      return prim_to_hw_prim[mode];
+   }
+}

From 3921b64e63db39a3f19ebb8250081ba7ddf843a2 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 4 Apr 2016 14:38:42 -0700
Subject: [PATCH 30/72] i965/fs: Make the repclear shader support either a
 uniform or a flat input

In the Vulkan driver we use a single flat input instead of a uniform
because setting up push constants is more disruptive to the pipeline than
setting up another vertex input.  This uses the number of uniforms as a key
to keep it working for the GL driver.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 1a6a229e444..3e93129b031 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2787,10 +2787,21 @@ fs_visitor::emit_repclear_shader()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    int base_mrf = 1;
    int color_mrf = base_mrf + 2;
+   fs_inst *mov;
 
-   fs_inst *mov = bld.exec_all().group(4, 0)
-                     .MOV(brw_message_reg(color_mrf),
-                          fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   if (uniforms > 0) {
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   } else {
+      struct brw_reg reg =
+         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+   }
 
    fs_inst *write;
    if (key->nr_color_regions == 1) {
@@ -2819,8 +2830,10 @@ fs_visitor::emit_repclear_shader()
    assign_curb_setup();
 
    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
-   assert(mov->src[0].file == FIXED_GRF);
-   mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   if (uniforms > 0) {
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   }
 }
 
 /**

From a241ab43b5599f29c43e143bbcaaffef2af3e982 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 18 Jan 2016 17:30:59 -0800
Subject: [PATCH 31/72] i965/fs_surface_builder: Mask signed integers after
 conversion

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 .../dri/i965/brw_fs_surface_builder.cpp        | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 75734d2cfa0..96731ffac7f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -717,6 +717,15 @@ namespace {
                   bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                   brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                   BRW_CONDITIONAL_GE);
+
+               /* Mask off all but the bits we actually want.  Otherwise, if
+                * we pass a negative number into the hardware when it's
+                * expecting something like UINT8, it will happily clamp it to
+                * +255 for us.
+                */
+               if (is_signed && widths[c] < 32)
+                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
+                          brw_imm_d(scale(widths[c])));
             }
          }
 
@@ -787,6 +796,15 @@ namespace {
                /* Convert to integer. */
                bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
                bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
+
+               /* Mask off all but the bits we actually want.  Otherwise, if
+                * we pass a negative number into the hardware when it's
+                * expecting something like UINT8, it will happily clamp it to
+                * +255 for us.
+                */
+               if (is_signed && widths[c] < 32)
+                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
+                          brw_imm_d(scale(widths[c])));
             }
          }
 

From 5c5a9b7bf6bc912143f6fa9088c1e6ec415ff3bb Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 4 Apr 2016 14:50:03 -0700
Subject: [PATCH 32/72] brw/device_info: Add a helper for getting a device name

This is needed by the Vulkan driver

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_device_info.c | 12 ++++++++++++
 src/mesa/drivers/dri/i965/brw_device_info.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index c703fb5d4cf..3666190fc36 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -482,3 +482,15 @@ brw_get_device_info(int devid)
 
    return devinfo;
 }
+
+const char *
+brw_get_device_name(int devid)
+{
+   switch (devid) {
+#undef CHIPSET
+#define CHIPSET(id, family, name) case id: return name;
+#include "pci_ids/i965_pci_ids.h"
+   default:
+      return NULL;
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h
index c641ffc281e..4e7f3135960 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.h
+++ b/src/mesa/drivers/dri/i965/brw_device_info.h
@@ -144,3 +144,4 @@ struct brw_device_info
 };
 
 const struct brw_device_info *brw_get_device_info(int devid);
+const char *brw_get_device_name(int devid);

From e61cc87c757f8bc0b6a3af318a512b22c072595c Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 5 Apr 2016 18:19:34 -0700
Subject: [PATCH 33/72] i965/fs: Add a flat_inputs field to prog_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_compiler.h |  6 +++++
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 31 ++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 231e0001d54..a42583bb477 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -402,6 +402,12 @@ struct brw_wm_prog_data {
     */
    uint32_t barycentric_interp_modes;
 
+   /**
+    * Mask of which FS inputs are marked flat by the shader source.  This is
+    * needed for setting up 3DSTATE_SF/SBE.
+    */
+   uint32_t flat_inputs;
+
    /**
     * Map from gl_varying_slot to the position within the FS setup data
     * payload where the varying's attribute vertex deltas should be delivered.
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 3e93129b031..954f7823558 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5578,6 +5578,31 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
    return barycentric_interp_modes;
 }
 
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+                        bool shade_model_flat, const nir_shader *shader)
+{
+   prog_data->flat_inputs = 0;
+
+   nir_foreach_variable(var, &shader->inputs) {
+      enum glsl_interp_qualifier interp_qualifier =
+         (enum glsl_interp_qualifier)var->data.interpolation;
+      bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+                         (var->data.location == VARYING_SLOT_COL1);
+
+      int input_index = prog_data->urb_setup[var->data.location];
+
+      if (input_index < 0)
+	 continue;
+
+      /* flat shading */
+      if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
+          (shade_model_flat && is_gl_Color &&
+           interp_qualifier == INTERP_QUALIFIER_NONE))
+         prog_data->flat_inputs |= (1 << input_index);
+   }
+}
+
 static uint8_t
 computed_depth_mode(const nir_shader *shader)
 {
@@ -5662,6 +5687,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
       }
    }
 
+   /* We have to compute the flat inputs after the visitor is finished running
+    * because it relies on prog_data->urb_setup which is computed in
+    * fs_visitor::calculate_urb_setup().
+    */
+   brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
+
    cfg_t *simd8_cfg;
    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
    if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {

From c62db279b6bc4e820345f468c00d4fd65be8556b Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 5 Apr 2016 18:23:36 -0700
Subject: [PATCH 34/72] i965/sf_state: Pull flat_enables out of prog_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, we were walking over the shader source to figure out which
inputs should be marked flat.  Now, we can just pull it out of prog_data.
This is needed for properly setting up 3DSTATE_SF/SBE for Vulkan and it
also means that it will get properly cached.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_state.h     |  1 -
 src/mesa/drivers/dri/i965/gen6_sf_state.c | 21 ++-------------------
 src/mesa/drivers/dri/i965/gen7_sf_state.c |  6 ++----
 src/mesa/drivers/dri/i965/gen8_sf_state.c |  4 +---
 4 files changed, 5 insertions(+), 27 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 783af78479e..2dc0a0da45b 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -368,7 +368,6 @@ void
 calculate_attr_overrides(const struct brw_context *brw,
                          uint16_t *attr_overrides,
                          uint32_t *point_sprite_enables,
-                         uint32_t *flat_enables,
                          uint32_t *urb_entry_read_length,
                          uint32_t *urb_entry_read_offset);
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 42f9a5ca8b6..4fdcb8d80e5 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -155,14 +155,12 @@ void
 calculate_attr_overrides(const struct brw_context *brw,
                          uint16_t *attr_overrides,
                          uint32_t *point_sprite_enables,
-                         uint32_t *flat_enables,
                          uint32_t *urb_entry_read_length,
                          uint32_t *urb_entry_read_offset)
 {
    uint32_t max_source_attr = 0;
 
    *point_sprite_enables = 0;
-   *flat_enables = 0;
 
    *urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
 
@@ -180,9 +178,6 @@ calculate_attr_overrides(const struct brw_context *brw,
 
    *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1;
 
-   /* _NEW_LIGHT */
-   bool shade_model_flat = brw->ctx.Light.ShadeModel == GL_FLAT;
-
    /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
     * description of dw10 Point Sprite Texture Coordinate Enable:
     *
@@ -208,10 +203,6 @@ calculate_attr_overrides(const struct brw_context *brw,
    memset(attr_overrides, 0, 16*sizeof(*attr_overrides));
 
    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-      /* BRW_NEW_FRAGMENT_PROGRAM */
-      enum glsl_interp_qualifier interp_qualifier =
-         brw->fragment_program->InterpQualifier[attr];
-      bool is_gl_Color = attr == VARYING_SLOT_COL0 || attr == VARYING_SLOT_COL1;
       /* BRW_NEW_FS_PROG_DATA */
       int input_index = brw->wm.prog_data->urb_setup[attr];
 
@@ -234,12 +225,6 @@ calculate_attr_overrides(const struct brw_context *brw,
             *point_sprite_enables |= (1 << input_index);
       }
 
-      /* flat shading */
-      if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
-          (shade_model_flat && is_gl_Color &&
-           interp_qualifier == INTERP_QUALIFIER_NONE))
-         *flat_enables |= (1 << input_index);
-
       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
       uint16_t attr_override = point_sprite ? 0 :
          get_attr_override(&brw->vue_map_geom_out,
@@ -285,7 +270,6 @@ upload_sf_state(struct brw_context *brw)
    uint32_t num_outputs = brw->wm.prog_data->num_varying_inputs;
    uint32_t dw1, dw2, dw3, dw4;
    uint32_t point_sprite_enables;
-   uint32_t flat_enables;
    int i;
    /* _NEW_BUFFER */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
@@ -428,8 +412,7 @@ upload_sf_state(struct brw_context *brw)
    uint32_t urb_entry_read_length;
    uint32_t urb_entry_read_offset;
    calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables,
-                            &flat_enables, &urb_entry_read_length,
-                            &urb_entry_read_offset);
+                            &urb_entry_read_length, &urb_entry_read_offset);
    dw1 |= (urb_entry_read_length << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
            urb_entry_read_offset << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT);
 
@@ -446,7 +429,7 @@ upload_sf_state(struct brw_context *brw)
       OUT_BATCH(attr_overrides[i * 2] | attr_overrides[i * 2 + 1] << 16);
    }
    OUT_BATCH(point_sprite_enables); /* dw16 */
-   OUT_BATCH(flat_enables);
+   OUT_BATCH(brw->wm.prog_data->flat_inputs);
    OUT_BATCH(0); /* wrapshortest enables 0-7 */
    OUT_BATCH(0); /* wrapshortest enables 8-15 */
    ADVANCE_BATCH();
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 7c98c73edf8..c76789fa252 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -38,7 +38,6 @@ upload_sbe_state(struct brw_context *brw)
    uint32_t num_outputs = brw->wm.prog_data->num_varying_inputs;
    uint32_t dw1;
    uint32_t point_sprite_enables;
-   uint32_t flat_enables;
    int i;
    uint16_t attr_overrides[16];
    /* _NEW_BUFFERS */
@@ -66,8 +65,7 @@ upload_sbe_state(struct brw_context *brw)
    uint32_t urb_entry_read_length;
    uint32_t urb_entry_read_offset;
    calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables,
-                            &flat_enables, &urb_entry_read_length,
-                            &urb_entry_read_offset);
+                            &urb_entry_read_length, &urb_entry_read_offset);
    dw1 |= urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
           urb_entry_read_offset << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT;
 
@@ -81,7 +79,7 @@ upload_sbe_state(struct brw_context *brw)
    }
 
    OUT_BATCH(point_sprite_enables); /* dw10 */
-   OUT_BATCH(flat_enables);
+   OUT_BATCH(brw->wm.prog_data->flat_inputs);
    OUT_BATCH(0); /* wrapshortest enables 0-7 */
    OUT_BATCH(0); /* wrapshortest enables 8-15 */
    ADVANCE_BATCH();
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index 2ac21f7c873..5a97c1d0e90 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -39,7 +39,6 @@ upload_sbe(struct brw_context *brw)
    uint32_t urb_entry_read_length;
    uint32_t urb_entry_read_offset;
    uint32_t point_sprite_enables;
-   uint32_t flat_enables;
    int sbe_cmd_length;
 
    uint32_t dw1 =
@@ -66,7 +65,6 @@ upload_sbe(struct brw_context *brw)
     */
    calculate_attr_overrides(brw, attr_overrides,
                             &point_sprite_enables,
-                            &flat_enables,
                             &urb_entry_read_length,
                             &urb_entry_read_offset);
 
@@ -109,7 +107,7 @@ upload_sbe(struct brw_context *brw)
    OUT_BATCH(_3DSTATE_SBE << 16 | (sbe_cmd_length - 2));
    OUT_BATCH(dw1);
    OUT_BATCH(point_sprite_enables);
-   OUT_BATCH(flat_enables);
+   OUT_BATCH(brw->wm.prog_data->flat_inputs);
    if (sbe_cmd_length >= 6) {
       OUT_BATCH(dw4);
       OUT_BATCH(dw5);

From b40375a21cd8c0336aa9c3fcbabe02c27f9d1471 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Wed, 17 Feb 2016 19:15:49 +1100
Subject: [PATCH 35/72] mesa: Add comment to framebuffer_parameteri()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

V.2:
 Change 'N.B.,' to 'NOTE:'.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/fbobject.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index bb8d4c3112b..c81f5a083eb 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1369,6 +1369,11 @@ _mesa_BindRenderbufferEXT(GLenum target, GLuint renderbuffer)
    bind_renderbuffer(target, renderbuffer, true);
 }
 
+/**
+ * ARB_framebuffer_no_attachments - Application passes requested params
+ * here. NOTE: NumSamples requested need not be _NumSamples which is
+ * what the hw supports.
+ */
 static void
 framebuffer_parameteri(struct gl_context *ctx, struct gl_framebuffer *fb,
                        GLenum pname, GLint param, const char *func)

From 85f79f0c7567e47ca4c5b204ddf7891fd12e3e85 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 2 Jan 2016 05:55:49 +1100
Subject: [PATCH 36/72] mesa/st: Use _mesa_geometric_ functions appropriately
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Change references to gl_framebuffer::Width, Height, MaxNumLayers
and Visual::samples to use the _mesa_geometric_ convenience functions
for those places where the geometry of the gl_framebuffer is needed.
This is in contrast to the geometry of the intersection of the
attachments of the gl_framebuffer.

This patch paves the way to enable GL_ARB_framebuffer_no_attachments
for all gallium drivers.

V.2:
 Remove intermediate variable state.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_atom_rasterizer.c | 2 +-
 src/mesa/state_tracker/st_atom_scissor.c    | 8 ++++++--
 src/mesa/state_tracker/st_cb_drawtex.c      | 9 +++++----
 src/mesa/state_tracker/st_cb_msaa.c         | 4 +++-
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 366163e42df..ed9deb03327 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -244,7 +244,7 @@ static void update_raster_state( struct st_context *st )
          _mesa_is_multisample_enabled(ctx) &&
          ctx->Multisample.SampleShading &&
          ctx->Multisample.MinSampleShadingValue *
-         ctx->DrawBuffer->Visual.samples > 1;
+         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    /* _NEW_SCISSOR */
    raster->scissor = ctx->Scissor.EnableFlags;
diff --git a/src/mesa/state_tracker/st_atom_scissor.c b/src/mesa/state_tracker/st_atom_scissor.c
index 4ebe799e35d..605d5cba9e7 100644
--- a/src/mesa/state_tracker/st_atom_scissor.c
+++ b/src/mesa/state_tracker/st_atom_scissor.c
@@ -32,6 +32,7 @@
  
 
 #include "main/macros.h"
+#include "main/framebuffer.h"
 #include "st_context.h"
 #include "pipe/p_context.h"
 #include "st_atom.h"
@@ -46,14 +47,17 @@ update_scissor( struct st_context *st )
    struct pipe_scissor_state scissor[PIPE_MAX_VIEWPORTS];
    const struct gl_context *ctx = st->ctx;
    const struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const unsigned int fb_width = _mesa_geometric_width(fb);
+   const unsigned int fb_height = _mesa_geometric_height(fb);
    GLint miny, maxy;
    unsigned i;
    bool changed = false;
+
    for (i = 0 ; i < ctx->Const.MaxViewports; i++) {
       scissor[i].minx = 0;
       scissor[i].miny = 0;
-      scissor[i].maxx = fb->Width;
-      scissor[i].maxy = fb->Height;
+      scissor[i].maxx = fb_width;
+      scissor[i].maxy = fb_height;
 
       if (ctx->Scissor.EnableFlags & (1 << i)) {
          /* need to be careful here with xmax or ymax < 0 */
diff --git a/src/mesa/state_tracker/st_cb_drawtex.c b/src/mesa/state_tracker/st_cb_drawtex.c
index a7926295277..e2af2357f02 100644
--- a/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/src/mesa/state_tracker/st_cb_drawtex.c
@@ -16,6 +16,7 @@
 #include "main/image.h"
 #include "main/macros.h"
 #include "main/teximage.h"
+#include "main/framebuffer.h"
 #include "program/program.h"
 #include "program/prog_print.h"
 
@@ -166,8 +167,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
       /* positions (in clip coords) */
       {
          const struct gl_framebuffer *fb = ctx->DrawBuffer;
-         const GLfloat fb_width = (GLfloat)fb->Width;
-         const GLfloat fb_height = (GLfloat)fb->Height;
+         const GLfloat fb_width = (GLfloat)_mesa_geometric_width(fb);
+         const GLfloat fb_height = (GLfloat)_mesa_geometric_height(fb);
 
          const GLfloat clip_x0 = (GLfloat)(x0 / fb_width * 2.0 - 1.0);
          const GLfloat clip_y0 = (GLfloat)(y0 / fb_height * 2.0 - 1.0);
@@ -262,8 +263,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    {
       const struct gl_framebuffer *fb = ctx->DrawBuffer;
       const GLboolean invert = (st_fb_orientation(fb) == Y_0_TOP);
-      const GLfloat width = (GLfloat)fb->Width;
-      const GLfloat height = (GLfloat)fb->Height;
+      const GLfloat width = (GLfloat)_mesa_geometric_width(fb);
+      const GLfloat height = (GLfloat)_mesa_geometric_height(fb);
       struct pipe_viewport_state vp;
       vp.scale[0] =  0.5f * width;
       vp.scale[1] = height * (invert ? -0.5f : 0.5f);
diff --git a/src/mesa/state_tracker/st_cb_msaa.c b/src/mesa/state_tracker/st_cb_msaa.c
index d581f2121b0..22001e49973 100644
--- a/src/mesa/state_tracker/st_cb_msaa.c
+++ b/src/mesa/state_tracker/st_cb_msaa.c
@@ -27,6 +27,7 @@
 
 #include "main/bufferobj.h"
 #include "main/imports.h"
+#include "main/framebuffer.h"
 
 #include "state_tracker/st_cb_msaa.h"
 #include "state_tracker/st_context.h"
@@ -47,7 +48,8 @@ st_GetSamplePosition(struct gl_context *ctx,
    st_validate_state(st, ST_PIPELINE_RENDER);
 
    if (st->pipe->get_sample_position)
-      st->pipe->get_sample_position(st->pipe, (unsigned) fb->Visual.samples,
+      st->pipe->get_sample_position(st->pipe,
+                                    _mesa_geometric_samples(fb),
                                     index, outPos);
    else
       outPos[0] = outPos[1] = 0.5f;

From 4bc9130fba2f815cb910536d3d3a253a8c3ed0b9 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Wed, 17 Feb 2016 20:59:52 +1100
Subject: [PATCH 37/72] gallium: Add PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add PIPE_CAP to determine if the GL extension
'GL_ARB_framebuffer_no_attachments' shall be
supported.

The driver is required to support 'PIPE_FORMAT_NONE'
via its 'is_format_supported()' callback in order
to determine the MSAA modes the hardware supports so
that values requested from the application using
'GL_ARB_framebuffer_no_attachments' may be quantized
to what the hardware expects.

V.2:
 Fix doc for a more detailed description of the PIPE_CAP
 and the corresponding GL constant.

V.3:
 Renamed and repurposed once again.

V.4:
 Remove CAP from cap_mapping array.

[airlied: fix damaged whitespace]

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/docs/source/screen.rst               | 8 ++++++++
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c           | 1 +
 src/gallium/drivers/ilo/ilo_screen.c             | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c         | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 1 +
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 1 +
 src/gallium/drivers/softpipe/sp_screen.c         | 1 +
 src/gallium/drivers/svga/svga_screen.c           | 1 +
 src/gallium/drivers/vc4/vc4_screen.c             | 1 +
 src/gallium/drivers/virgl/virgl_screen.c         | 1 +
 src/gallium/include/pipe/p_defines.h             | 1 +
 16 files changed, 23 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 47a19de6ea9..824f580ed44 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -323,6 +323,14 @@ The integer capabilities:
 * ``PIPE_CAP_PCI_BUS``: Return the PCI bus number.
 * ``PIPE_CAP_PCI_DEVICE``: Return the PCI device number.
 * ``PIPE_CAP_PCI_FUNCTION``: Return the PCI function number.
+* ``PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT``:
+  If non-zero, rendering to framebuffers with no surface attachments
+  is supported. The context->is_format_supported function will be expected
+  to be implemented with PIPE_FORMAT_NONE yielding the MSAA modes the hardware
+  supports. N.B., The maximum number of layers supported for rasterizing a
+  primitive on a layer is obtained from ``PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS``
+  even though it can be larger than the number of layers supported by either
+  rendering or textures.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index d47cb07f10b..707be17513b 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -255,6 +255,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_INVALIDATE_BUFFER:
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index f4aa310ecdc..68e32e51c34 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -269,6 +269,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 7812c826250..142d6f1fa21 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -498,6 +498,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 2529b546564..6a5f906adc6 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -319,6 +319,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index b105c6aeb80..db7c2d15fb1 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -192,6 +192,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index ba5e5003b69..20fb61b51f4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -245,6 +245,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index ec2340ee0c3..c41912a6037 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -251,6 +251,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 1c3bb64f0e4..b3a7f049e10 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -214,6 +214,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
         case PIPE_CAP_QUERY_BUFFER_OBJECT:
         case PIPE_CAP_QUERY_MEMORY_INFO:
+        case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index c97e34121e3..e6eec3bcf70 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -364,6 +364,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_STRING_MARKER:
 	case PIPE_CAP_QUERY_BUFFER_OBJECT:
+	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 407b9e19cc4..275b6767be4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -356,6 +356,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_STRING_MARKER:
 	case PIPE_CAP_QUERY_BUFFER_OBJECT:
+	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index bfd3598fc57..90f29d6e52a 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -270,6 +270,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index c0873c0c65a..ccf794ecda7 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -404,6 +404,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
    case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
    case PIPE_CAP_QUERY_BUFFER_OBJECT:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
    }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 92d910ba6a5..167a2f5bd8e 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -207,6 +207,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_PCI_BUS:
         case PIPE_CAP_PCI_DEVICE:
         case PIPE_CAP_PCI_FUNCTION:
+        case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 8126bdec40c..5a5afc1712f 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -239,6 +239,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 6f30f9ed7d3..5e204a3e5ea 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -690,6 +690,7 @@ enum pipe_cap
    PIPE_CAP_PCI_BUS,
    PIPE_CAP_PCI_DEVICE,
    PIPE_CAP_PCI_FUNCTION,
+   PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From 2016e9ffda26aac6a65c363f38afc047b72d3e83 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Wed, 17 Feb 2016 21:01:57 +1100
Subject: [PATCH 38/72] gallium: Obtain ARB_framebuffer_no_attachment constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Set default values for the constants required in
ARB_framebuffer_no_attachments and obtain the number
of layers from ``PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS``.

We also obtain the MaxFramebufferSamples value using
a query back to the driver for PIPE_FORMAT_NONE.

V.1:
 Merge if branch predicates into one branch.
 Move const init into st_init_limits()

[airlied: whitespace fixup]
Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_extensions.c | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 6c0df8d2a98..287894317df 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -445,6 +445,18 @@ void st_init_limits(struct pipe_screen *screen,
       extensions->ARB_shader_image_load_store = GL_TRUE;
       extensions->ARB_shader_image_size = GL_TRUE;
    }
+
+   /* ARB_framebuffer_no_attachments */
+   c->MaxFramebufferWidth   = c->MaxViewportWidth;
+   c->MaxFramebufferHeight  = c->MaxViewportHeight;
+   /* NOTE: we cheat here a little by assuming that
+    * PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS has the same
+    * number of layers as we need, although we technically
+    * could have more the generality is not really useful
+    * in practicality.
+    */
+   c->MaxFramebufferLayers =
+      screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS);
 }
 
 
@@ -956,6 +968,9 @@ void st_init_extensions(struct pipe_screen *screen,
       enum pipe_format int_formats[] = {
          PIPE_FORMAT_R8G8B8A8_SINT
       };
+      enum pipe_format void_formats[] = {
+         PIPE_FORMAT_NONE
+      };
 
       consts->MaxSamples =
          get_max_samples_for_formats(screen, ARRAY_SIZE(color_formats),
@@ -976,6 +991,12 @@ void st_init_extensions(struct pipe_screen *screen,
          get_max_samples_for_formats(screen, ARRAY_SIZE(int_formats),
                                      int_formats, consts->MaxSamples,
                                      PIPE_BIND_SAMPLER_VIEW);
+
+      /* ARB_framebuffer_no_attachments, assume max no. of samples 32 */
+      consts->MaxFramebufferSamples =
+         get_max_samples_for_formats(screen, ARRAY_SIZE(void_formats),
+                                     void_formats, 32,
+                                     PIPE_BIND_RENDER_TARGET);
    }
    if (consts->MaxSamples == 1) {
       /* one sample doesn't really make sense */
@@ -1068,6 +1089,13 @@ void st_init_extensions(struct pipe_screen *screen,
          extensions->AMD_vertex_shader_viewport_index = GL_TRUE;
    }
 
+   /* ARB_framebuffer_no_attachments */
+   if (screen->get_param(screen, PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT) &&
+       ((consts->MaxSamples >= 4 && consts->MaxFramebufferLayers >= 2048) ||
+        (consts->MaxFramebufferSamples >= consts->MaxSamples &&
+         consts->MaxFramebufferLayers >= consts->MaxArrayTextureLayers)))
+      extensions->ARB_framebuffer_no_attachments = GL_TRUE;
+
    /* GL_ARB_ES3_compatibility.
     *
     * Assume that ES3 is supported if GLSL 3.30 is supported.

From b512b5fd3664781d1f9ada1c784353b85bbe0e5b Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Wed, 17 Feb 2016 10:27:41 +1100
Subject: [PATCH 39/72] mesa/st: Set _NumSamples in update_framebuffer_state()

PIPE_FORMAT_NONE is used to indicate which MSAA modes are supported
by a framebuffer with no attachments.

V.2:
 Rewrite MSAA mode loop to be more general.
V.3:
 Move comment to right place after loop was rewritten.
V.4: [airlied]
 remove unneeded variable, and assert, and unneeded pipe assignment

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_atom_framebuffer.c | 46 ++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/src/mesa/state_tracker/st_atom_framebuffer.c b/src/mesa/state_tracker/st_atom_framebuffer.c
index ae883a2535e..5c23735f9ac 100644
--- a/src/mesa/state_tracker/st_atom_framebuffer.c
+++ b/src/mesa/state_tracker/st_atom_framebuffer.c
@@ -64,6 +64,41 @@ update_framebuffer_size(struct pipe_framebuffer_state *framebuffer,
    framebuffer->height = MIN2(framebuffer->height, surface->height);
 }
 
+/**
+ * Round up the requested multisample count to the next supported sample size.
+ */
+static unsigned
+framebuffer_quantize_num_samples(struct st_context *st, unsigned num_samples)
+{
+   struct pipe_screen *screen = st->pipe->screen;
+   int quantized_samples = 0;
+   unsigned msaa_mode;
+
+   if (!num_samples)
+      return 0;
+
+   /* Assumes the highest supported MSAA is a power of 2 */
+   msaa_mode = util_next_power_of_two(st->ctx->Const.MaxFramebufferSamples);
+   assert(!(num_samples > msaa_mode)); /* be safe from infinite loops */
+
+   /**
+    * Check if the MSAA mode that is higher than the requested
+    * num_samples is supported, and if so returning it.
+    */
+   for (; msaa_mode >= num_samples; msaa_mode = msaa_mode / 2) {
+      /**
+       * For ARB_framebuffer_no_attachment, A format of
+       * PIPE_FORMAT_NONE implies what number of samples is
+       * supported for a framebuffer with no attachment. Thus the
+       * drivers callback must be adjusted for this.
+       */
+      if (screen->is_format_supported(screen, PIPE_FORMAT_NONE,
+                                      PIPE_TEXTURE_2D, msaa_mode,
+                                      PIPE_BIND_RENDER_TARGET))
+         quantized_samples = msaa_mode;
+   }
+   return quantized_samples;
+}
 
 /**
  * Update framebuffer state (color, depth, stencil, etc. buffers)
@@ -82,6 +117,17 @@ update_framebuffer_state( struct st_context *st )
    framebuffer->width  = UINT_MAX;
    framebuffer->height = UINT_MAX;
 
+   /**
+    * Quantize the derived default number of samples:
+    *
+    * A query to the driver of supported MSAA values the
+    * hardware supports is done as to legalize the number
+    * of application requested samples, NumSamples.
+    * See commit eb9cf3c for more information.
+    */
+   fb->DefaultGeometry._NumSamples =
+      framebuffer_quantize_num_samples(st, fb->DefaultGeometry.NumSamples);
+
    /*printf("------ fb size %d x %d\n", fb->Width, fb->Height);*/
 
    /* Examine Mesa's ctx->DrawBuffer->_ColorDrawBuffers state

From 0b7075fed75585087fa54b537fa9866bce2fcbee Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sun, 3 Jan 2016 01:44:55 +1100
Subject: [PATCH 40/72] gallium: Put no.of {samples,layers} into
 pipe_framebuffer_state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here we store the number of samples and layers directly in the
pipe_framebuffer_state so that in the case of
ARB_framebuffer_no_attachments we may make use of them directly.

Further, we adjust various gallium/auxiliary helper functions
accordingly.

V2:
  Convert branches in util_framebuffer_get_num_layers() and
  util_framebuffer_get_num_samples() to their canonical form.

V3:
  'git stash pop' the typo fix of 'cbufs' which should be
  'nr_cbufs' that was missing in V2, woops! Thanks Marek for
  pointing this out yet again.

V4:
  Squash in the following patch:

  'gallium/util: Ensure util_framebuffer_get_num_samples() is valid'

   Upon context creation, internal driver structures are malloc()'ed
   and then memset() to zero. This results in an invalid number of
   samples 'by default'. Handle this in the simplest way to avoid
   elaborate and probably equally sub-optimal solutions.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/util/u_dump_state.c  |  2 ++
 src/gallium/auxiliary/util/u_framebuffer.c | 30 ++++++++++++++++++++++
 src/gallium/include/pipe/p_state.h         |  8 ++++++
 3 files changed, 40 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index a73a1de2f0b..b1f3982fb4e 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -645,6 +645,8 @@ util_dump_framebuffer_state(FILE *stream, const struct pipe_framebuffer_state *s
 
    util_dump_member(stream, uint, state, width);
    util_dump_member(stream, uint, state, height);
+   util_dump_member(stream, uint, state, samples);
+   util_dump_member(stream, uint, state, layers);
    util_dump_member(stream, uint, state, nr_cbufs);
    util_dump_member_array(stream, ptr, state, cbufs);
    util_dump_member(stream, ptr, state, zsbuf);
diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c
index 49b391d8162..f9b804673dc 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.c
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -55,6 +55,10 @@ util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
        dst->height != src->height)
       return FALSE;
 
+   if (dst->samples != src->samples ||
+       dst->layers  != src->layers)
+      return FALSE;
+
    if (dst->nr_cbufs != src->nr_cbufs) {
       return FALSE;
    }
@@ -85,6 +89,9 @@ util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
    dst->width = src->width;
    dst->height = src->height;
 
+   dst->samples = src->samples;
+   dst->layers  = src->layers;
+
    for (i = 0; i < src->nr_cbufs; i++)
       pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
 
@@ -109,6 +116,7 @@ util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb)
 
    pipe_surface_reference(&fb->zsbuf, NULL);
 
+   fb->samples = fb->layers = 0;
    fb->width = fb->height = 0;
    fb->nr_cbufs = 0;
 }
@@ -160,6 +168,14 @@ util_framebuffer_get_num_layers(const struct pipe_framebuffer_state *fb)
 {
 	unsigned i, num_layers = 0;
 
+	/**
+	 * In the case of ARB_framebuffer_no_attachment
+	 * we obtain the number of layers directly from
+	 * the framebuffer state.
+	 */
+	if (!(fb->nr_cbufs || fb->zsbuf))
+		return fb->layers;
+
 	for (i = 0; i < fb->nr_cbufs; i++) {
 		if (fb->cbufs[i]) {
 			unsigned num = fb->cbufs[i]->u.tex.last_layer -
@@ -184,6 +200,20 @@ util_framebuffer_get_num_samples(const struct pipe_framebuffer_state *fb)
 {
    unsigned i;
 
+   /**
+    * In the case of ARB_framebuffer_no_attachment
+    * we obtain the number of samples directly from
+    * the framebuffer state.
+    *
+    * NOTE: fb->samples may wind up as zero due to memset()'s on internal
+    *       driver structures on their initialization and so we take the
+    *       MAX here to ensure we have a valid number of samples. However,
+    *       if samples is legitimately not getting set somewhere
+    *       multi-sampling will evidently break.
+    */
+   if (!(fb->nr_cbufs || fb->zsbuf))
+      return MAX2(fb->samples, 1);
+
    for (i = 0; i < fb->nr_cbufs; i++) {
       if (fb->cbufs[i]) {
          return MAX2(1, fb->cbufs[i]->texture->nr_samples);
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 5ab53728e82..cb806cb6550 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -298,9 +298,17 @@ struct pipe_stencil_ref
 };
 
 
+/**
+ * Note that pipe_surfaces are "texture views for rendering"
+ * and so in the case of ARB_framebuffer_no_attachment there
+ * is no pipe_surface state available such that we may
+ * extract the number of samples and layers.
+ */
 struct pipe_framebuffer_state
 {
    unsigned width, height;
+   unsigned samples; /**< Number of samples in a no-attachment framebuffer */
+   unsigned layers;  /**< Number of layers  in a no-attachment framebuffer */
 
    /** multiple color buffers for multiple render targets */
    unsigned nr_cbufs;

From 7ff28d2af0e6099cecb2c6456765e189785a027c Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Mon, 1 Feb 2016 11:16:06 +1100
Subject: [PATCH 41/72] gallium/trace: Dump no.of samples and layers in fb
 state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/trace/tr_dump_state.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index b53d7dbec2f..e805706f19d 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -481,6 +481,8 @@ void trace_dump_framebuffer_state(const struct pipe_framebuffer_state *state)
 
    trace_dump_member(uint, state, width);
    trace_dump_member(uint, state, height);
+   trace_dump_member(uint, state, samples);
+   trace_dump_member(uint, state, layers);
    trace_dump_member(uint, state, nr_cbufs);
    trace_dump_member_array(ptr, state, cbufs);
    trace_dump_member(ptr, state, zsbuf);

From c6a514d7dfde51711399b1c3ffec7b7c7d1bad3b Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sun, 3 Jan 2016 21:08:33 +1100
Subject: [PATCH 42/72] mesa/st: Update framebuffer state with no.of
 samples,layers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Handle the case of ARB_framebuffer_no_attachments.
Also, kill off a dead debug printf() call while we are here.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_atom_framebuffer.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_framebuffer.c b/src/mesa/state_tracker/st_atom_framebuffer.c
index 5c23735f9ac..ade3d61dc70 100644
--- a/src/mesa/state_tracker/st_atom_framebuffer.c
+++ b/src/mesa/state_tracker/st_atom_framebuffer.c
@@ -43,6 +43,7 @@
 #include "util/u_math.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "main/framebuffer.h"
 
 
 /**
@@ -114,8 +115,6 @@ update_framebuffer_state( struct st_context *st )
    st_flush_bitmap_cache(st);
 
    st->state.fb_orientation = st_fb_orientation(fb);
-   framebuffer->width  = UINT_MAX;
-   framebuffer->height = UINT_MAX;
 
    /**
     * Quantize the derived default number of samples:
@@ -128,7 +127,10 @@ update_framebuffer_state( struct st_context *st )
    fb->DefaultGeometry._NumSamples =
       framebuffer_quantize_num_samples(st, fb->DefaultGeometry.NumSamples);
 
-   /*printf("------ fb size %d x %d\n", fb->Width, fb->Height);*/
+   framebuffer->width  = _mesa_geometric_width(fb);
+   framebuffer->height = _mesa_geometric_height(fb);
+   framebuffer->samples = _mesa_geometric_samples(fb);
+   framebuffer->layers = _mesa_geometric_layers(fb);
 
    /* Examine Mesa's ctx->DrawBuffer->_ColorDrawBuffers state
     * to determine which surfaces to draw to

From 63f2b2f2c02fd4685322bc125a81d6d41f73b7c9 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Fri, 12 Feb 2016 21:11:57 +1100
Subject: [PATCH 43/72] softpipe: Set samples and layers in
 set_framebuffer_state() cb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Carries across the number of samples and layers state in the
'softpipe_set_framebuffer_state()' callback. This state is
part of 'ARB_framebuffer_no_attachments' support.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/softpipe/sp_state_surface.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
index db4b2735d58..1a4bf384b2a 100644
--- a/src/gallium/drivers/softpipe/sp_state_surface.c
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -94,6 +94,8 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
 
    sp->framebuffer.width = fb->width;
    sp->framebuffer.height = fb->height;
+   sp->framebuffer.samples = fb->samples;
+   sp->framebuffer.layers = fb->layers;
 
    sp->dirty |= SP_NEW_FRAMEBUFFER;
 }

From bb1bd0ddd70706c765053bc84deecac77d9b2349 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sun, 20 Mar 2016 14:50:04 +1100
Subject: [PATCH 44/72] radeonsi: Allow 16 samples MSAA mode for
 PIPE_FORMAT_NONE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For ARB_framebuffer_no_attachments, an is_format_supported() query
with 'PIPE_FORMAT_NONE' passed implies a query of the number of
samples supported by a framebuffer with no attachment.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 10d691a92f1..415b03aac1d 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2000,6 +2000,11 @@ boolean si_is_format_supported(struct pipe_screen *screen,
 		case 4:
 		case 8:
 			break;
+		case 16:
+			if (format == PIPE_FORMAT_NONE)
+				return TRUE;
+			else
+				return FALSE;
 		default:
 			return FALSE;
 		}

From 1156cad405eb380f065086710df114ed40e767b9 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sun, 20 Mar 2016 18:01:06 +1100
Subject: [PATCH 45/72] radeonsi: Improve assert info out of
 si_set_framebuffer_state()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's give the developer a little hand if we are going to assert
on a zero literal at the end of a branch.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 415b03aac1d..f559c73f065 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2628,6 +2628,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 			constbuf.user_buffer = sctx->b.sample_locations_16x;
 			break;
 		default:
+			R600_ERR("Requested an invalid number of samples %i.\n",
+				 sctx->framebuffer.nr_samples);
 			assert(0);
 		}
 		constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;

From 483a686f809a2c778912a5536049b1df0cf97d9f Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 2 Jan 2016 05:53:57 +1100
Subject: [PATCH 46/72] radeonsi: Enable ARB_framebuffer_no_attachments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 275b6767be4..97be59d2272 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -307,6 +307,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
 	case PIPE_CAP_QUERY_MEMORY_INFO:
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -356,7 +357,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_STRING_MARKER:
 	case PIPE_CAP_QUERY_BUFFER_OBJECT:
-	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:

From ea310f2b38c30ff4e0e1ee03fca179a4e97cb10b Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Mon, 1 Feb 2016 12:12:12 +1100
Subject: [PATCH 47/72] r600g: Enable ARB_framebuffer_no_attachments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/r600_pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index e6eec3bcf70..36b808fbbca 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -281,6 +281,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_INVALIDATE_BUFFER:
 	case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
 	case PIPE_CAP_QUERY_MEMORY_INFO:
+	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
 		return 1;
 
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@@ -364,7 +365,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_STRING_MARKER:
 	case PIPE_CAP_QUERY_BUFFER_OBJECT:
-	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:

From 6fc3e7c988876f960864acffe64dd90bab5325a6 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Mon, 1 Feb 2016 11:18:15 +1100
Subject: [PATCH 48/72] GL3.txt: Mark ARB_framebuffer_no_attachments as done
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 docs/GL3.txt              | 2 +-
 docs/relnotes/11.3.0.html | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 5b6dc89e250..c48802a9f7b 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -172,7 +172,7 @@ GL 4.3, GLSL 4.30:
   GL_KHR_debug                                          DONE (all drivers)
   GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
   GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
-  GL_ARB_framebuffer_no_attachments                     DONE (i965)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965, r600, radeonsi)
   GL_ARB_internalformat_query2                          DONE (all drivers)
   GL_ARB_invalidate_subdata                             DONE (all drivers)
   GL_ARB_multi_draw_indirect                            DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index d494b84d568..d56f6553fe9 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -44,6 +44,7 @@ Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
+<li>GL_ARB_framebuffer_no_attachments on r600, radeonsi</li>
 <li>GL_ARB_internalformat_query2 on all drivers</li>
 <li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
 <li>GL_ARB_shader_image_load_store on radeonsi, softpipe</li>

From 0560c82ff6366edd1ffb52508839586e018457c6 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 6 Apr 2016 22:20:17 +0100
Subject: [PATCH 49/72] r600: cleanup whitespace in evergreen_compute.c

This aligns the code with the style of the rest of the driver.

Makes editing it a lot less painful.

Acked-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 162 +++++++++----------
 1 file changed, 75 insertions(+), 87 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index f4b669000dc..4483be358fa 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -83,29 +83,26 @@ writable images will consume TEX slots, VTX slots too because of linear indexing
 
 */
 
-struct r600_resource* r600_compute_buffer_alloc_vram(
-       struct r600_screen *screen,
-       unsigned size)
+struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
+						     unsigned size)
 {
-	struct pipe_resource * buffer = NULL;
+	struct pipe_resource *buffer = NULL;
 	assert(size);
 
-	buffer = pipe_buffer_create(
-		(struct pipe_screen*) screen,
-		PIPE_BIND_CUSTOM,
-		PIPE_USAGE_IMMUTABLE,
-		size);
+	buffer = pipe_buffer_create((struct pipe_screen*) screen,
+				    PIPE_BIND_CUSTOM,
+				    PIPE_USAGE_IMMUTABLE,
+				    size);
 
 	return (struct r600_resource *)buffer;
 }
 
 
-static void evergreen_set_rat(
-	struct r600_pipe_compute *pipe,
-	unsigned id,
-	struct r600_resource* bo,
-	int start,
-	int size)
+static void evergreen_set_rat(struct r600_pipe_compute *pipe,
+			      unsigned id,
+			      struct r600_resource *bo,
+			      int start,
+			      int size)
 {
 	struct pipe_surface rat_templ;
 	struct r600_surface *surf = NULL;
@@ -145,11 +142,10 @@ static void evergreen_set_rat(
 	evergreen_init_color_surface_rat(rctx, surf);
 }
 
-static void evergreen_cs_set_vertex_buffer(
-	struct r600_context * rctx,
-	unsigned vb_index,
-	unsigned offset,
-	struct pipe_resource * buffer)
+static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
+					   unsigned vb_index,
+					   unsigned offset,
+					   struct pipe_resource *buffer)
 {
 	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
 	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
@@ -166,12 +162,11 @@ static void evergreen_cs_set_vertex_buffer(
 	r600_mark_atom_dirty(rctx, &state->atom);
 }
 
-static void evergreen_cs_set_constant_buffer(
-	struct r600_context * rctx,
-	unsigned cb_index,
-	unsigned offset,
-	unsigned size,
-	struct pipe_resource * buffer)
+static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
+					     unsigned cb_index,
+					     unsigned offset,
+					     unsigned size,
+					     struct pipe_resource *buffer)
 {
 	struct pipe_constant_buffer cb;
 	cb.buffer_size = size;
@@ -256,14 +251,13 @@ static void r600_destroy_shader(struct r600_bytecode *bc)
 	FREE(bc->bytecode);
 }
 
-void *evergreen_create_compute_state(
-	struct pipe_context *ctx_,
-	const const struct pipe_compute_state *cso)
+void *evergreen_create_compute_state(struct pipe_context *ctx_,
+				     const const struct pipe_compute_state *cso)
 {
 	struct r600_context *ctx = (struct r600_context *)ctx_;
 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 #ifdef HAVE_OPENCL
-	const struct pipe_llvm_program_header * header;
+	const struct pipe_llvm_program_header *header;
 	const char *code;
 	void *p;
 	boolean use_kill;
@@ -290,12 +284,13 @@ void *evergreen_create_compute_state(
 	return shader;
 }
 
-void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
+void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
 {
 	struct r600_context *ctx = (struct r600_context *)ctx_;
-	COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
 	struct r600_pipe_compute *shader = state;
 
+	COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
+
 	if (!shader)
 		return;
 
@@ -327,11 +322,10 @@ static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
  *             (x,y,z)
  * DWORDS 9+ : Kernel parameters
  */
-void evergreen_compute_upload_input(
-	struct pipe_context *ctx_,
-	const uint *block_layout,
-	const uint *grid_layout,
-	const void *input)
+void evergreen_compute_upload_input(struct pipe_context *ctx_,
+				    const uint *block_layout,
+				    const uint *grid_layout,
+				    const void *input)
 {
 	struct r600_context *ctx = (struct r600_context *)ctx_;
 	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
@@ -340,10 +334,10 @@ void evergreen_compute_upload_input(
 	 * parameters.
 	 */
 	unsigned input_size = shader->input_size + 36;
-	uint32_t * num_work_groups_start;
-	uint32_t * global_size_start;
-	uint32_t * local_size_start;
-	uint32_t * kernel_parameters_start;
+	uint32_t *num_work_groups_start;
+	uint32_t *global_size_start;
+	uint32_t *local_size_start;
+	uint32_t *kernel_parameters_start;
 	struct pipe_box box;
 	struct pipe_transfer *transfer = NULL;
 
@@ -393,9 +387,9 @@ void evergreen_compute_upload_input(
 			(struct pipe_resource*)shader->kernel_param);
 }
 
-static void evergreen_emit_direct_dispatch(
-		struct r600_context *rctx,
-		const uint *block_layout, const uint *grid_layout)
+static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
+					   const uint *block_layout,
+					   const uint *grid_layout)
 {
 	int i;
 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
@@ -462,8 +456,9 @@ static void evergreen_emit_direct_dispatch(
 	radeon_emit(cs, 1);
 }
 
-static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
-		const uint *grid_layout)
+static void compute_emit_cs(struct r600_context *ctx,
+			    const uint *block_layout,
+			    const uint *grid_layout)
 {
 	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 	unsigned i;
@@ -574,9 +569,8 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 /**
  * Emit function for r600_cs_shader_state atom
  */
-void evergreen_emit_cs_shader(
-		struct r600_context *rctx,
-		struct r600_atom *atom)
+void evergreen_emit_cs_shader(struct r600_context *rctx,
+			      struct r600_atom *atom)
 {
 	struct r600_cs_shader_state *state =
 					(struct r600_cs_shader_state*)atom;
@@ -604,8 +598,8 @@ void evergreen_emit_cs_shader(
 					      RADEON_PRIO_USER_SHADER));
 }
 
-static void evergreen_launch_grid(
-		struct pipe_context *ctx_, const struct pipe_grid_info *info)
+static void evergreen_launch_grid(struct pipe_context *ctx_,
+				  const struct pipe_grid_info *info)
 {
 	struct r600_context *ctx = (struct r600_context *)ctx_;
 #ifdef HAVE_OPENCL
@@ -625,9 +619,9 @@ static void evergreen_launch_grid(
 	compute_emit_cs(ctx, info->block, info->grid);
 }
 
-static void evergreen_set_compute_resources(struct pipe_context * ctx_,
-		unsigned start, unsigned count,
-		struct pipe_surface ** surfaces)
+static void evergreen_set_compute_resources(struct pipe_context *ctx_,
+					    unsigned start, unsigned count,
+					    struct pipe_surface **surfaces)
 {
 	struct r600_context *ctx = (struct r600_context *)ctx_;
 	struct r600_surface **resources = (struct r600_surface **)surfaces;
@@ -659,10 +653,10 @@ static void evergreen_set_compute_resources(struct pipe_context * ctx_,
 	}
 }
 
-static void evergreen_set_global_binding(
-	struct pipe_context *ctx_, unsigned first, unsigned n,
-	struct pipe_resource **resources,
-	uint32_t **handles)
+static void evergreen_set_global_binding(struct pipe_context *ctx_,
+					 unsigned first, unsigned n,
+					 struct pipe_resource **resources,
+					 uint32_t **handles)
 {
 	struct r600_context *ctx = (struct r600_context *)ctx_;
 	struct compute_memory_pool *pool = ctx->screen->global_pool;
@@ -914,9 +908,8 @@ void evergreen_init_compute_state_functions(struct r600_context *ctx)
 
 }
 
-struct pipe_resource *r600_compute_global_buffer_create(
-	struct pipe_screen *screen,
-	const struct pipe_resource *templ)
+struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
+							const struct pipe_resource *templ)
 {
 	struct r600_resource_global* result = NULL;
 	struct r600_screen* rscreen = NULL;
@@ -954,9 +947,8 @@ struct pipe_resource *r600_compute_global_buffer_create(
 	return &result->base.b.b;
 }
 
-void r600_compute_global_buffer_destroy(
-	struct pipe_screen *screen,
-	struct pipe_resource *res)
+void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
+					struct pipe_resource *res)
 {
 	struct r600_resource_global* buffer = NULL;
 	struct r600_screen* rscreen = NULL;
@@ -973,13 +965,12 @@ void r600_compute_global_buffer_destroy(
 	free(res);
 }
 
-void *r600_compute_global_transfer_map(
-	struct pipe_context *ctx_,
-	struct pipe_resource *resource,
-	unsigned level,
-	unsigned usage,
-	const struct pipe_box *box,
-	struct pipe_transfer **ptransfer)
+void *r600_compute_global_transfer_map(struct pipe_context *ctx_,
+				       struct pipe_resource *resource,
+				       unsigned level,
+				       unsigned usage,
+				       const struct pipe_box *box,
+				       struct pipe_transfer **ptransfer)
 {
 	struct r600_context *rctx = (struct r600_context*)ctx_;
 	struct compute_memory_pool *pool = rctx->screen->global_pool;
@@ -1025,9 +1016,8 @@ void *r600_compute_global_transfer_map(
 			offset, box->width, usage, ptransfer);
 }
 
-void r600_compute_global_transfer_unmap(
-	struct pipe_context *ctx_,
-	struct pipe_transfer* transfer)
+void r600_compute_global_transfer_unmap(struct pipe_context *ctx_,
+					struct pipe_transfer *transfer)
 {
 	/* struct r600_resource_global are not real resources, they just map
 	 * to an offset within the compute memory pool.  The function
@@ -1042,23 +1032,21 @@ void r600_compute_global_transfer_unmap(
 	assert (!"This function should not be called");
 }
 
-void r600_compute_global_transfer_flush_region(
-	struct pipe_context *ctx_,
-	struct pipe_transfer *transfer,
-	const struct pipe_box *box)
+void r600_compute_global_transfer_flush_region(struct pipe_context *ctx_,
+					       struct pipe_transfer *transfer,
+					       const struct pipe_box *box)
 {
 	assert(0 && "TODO");
 }
 
-void r600_compute_global_transfer_inline_write(
-	struct pipe_context *pipe,
-	struct pipe_resource *resource,
-	unsigned level,
-	unsigned usage,
-	const struct pipe_box *box,
-	const void *data,
-	unsigned stride,
-	unsigned layer_stride)
+void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
+					       struct pipe_resource *resource,
+					       unsigned level,
+					       unsigned usage,
+					       const struct pipe_box *box,
+					       const void *data,
+					       unsigned stride,
+					       unsigned layer_stride)
 {
 	assert(0 && "TODO");
 }

From aeb2be3a2f1839b91532b178b997b20ddb69eb13 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 6 Apr 2016 22:23:32 +0100
Subject: [PATCH 50/72] r600: use rctx consistently in evergreen_compute.c

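Rename the struct r600_context pointers (locals and parameters) from
ctx to rctx throughout evergreen_compute.c, matching the functions in
this file that already use rctx.  Another step towards cleaning this
file up.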

Acked-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 148 +++++++++----------
 1 file changed, 74 insertions(+), 74 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 4483be358fa..6abb77f676c 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -254,7 +254,7 @@ static void r600_destroy_shader(struct r600_bytecode *bc)
 void *evergreen_create_compute_state(struct pipe_context *ctx_,
 				     const const struct pipe_compute_state *cso)
 {
-	struct r600_context *ctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx_;
 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 #ifdef HAVE_OPENCL
 	const struct pipe_llvm_program_header *header;
@@ -262,21 +262,21 @@ void *evergreen_create_compute_state(struct pipe_context *ctx_,
 	void *p;
 	boolean use_kill;
 
-	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
+	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
 	header = cso->prog;
 	code = cso->prog + sizeof(struct pipe_llvm_program_header);
 	radeon_shader_binary_init(&shader->binary);
 	radeon_elf_read(code, header->num_bytes, &shader->binary);
 	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
 
-	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
+	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
 							shader->bc.ndw * 4);
-	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
+	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
 	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
-	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
+	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
 #endif
 
-	shader->ctx = ctx;
+	shader->ctx = rctx;
 	shader->local_size = cso->req_local_mem;
 	shader->private_size = cso->req_private_mem;
 	shader->input_size = cso->req_input_mem;
@@ -286,10 +286,10 @@ void *evergreen_create_compute_state(struct pipe_context *ctx_,
 
 void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
 {
-	struct r600_context *ctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx_;
 	struct r600_pipe_compute *shader = state;
 
-	COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
+	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
 
 	if (!shader)
 		return;
@@ -304,11 +304,11 @@ void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
 
 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
 {
-	struct r600_context *ctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx_;
 
-	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
+	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
 
-	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
+	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
 }
 
 /* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
@@ -327,8 +327,8 @@ void evergreen_compute_upload_input(struct pipe_context *ctx_,
 				    const uint *grid_layout,
 				    const void *input)
 {
-	struct r600_context *ctx = (struct r600_context *)ctx_;
-	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
+	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 	unsigned i;
 	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
 	 * parameters.
@@ -376,14 +376,14 @@ void evergreen_compute_upload_input(struct pipe_context *ctx_,
 	memcpy(kernel_parameters_start, input, shader->input_size);
 
 	for (i = 0; i < (input_size / 4); i++) {
-		COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
+		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
 			((unsigned*)num_work_groups_start)[i]);
 	}
 
 	ctx_->transfer_unmap(ctx_, transfer);
 
 	/* ID=0 is reserved for the parameters */
-	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
+	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
 			(struct pipe_resource*)shader->kernel_param);
 }
 
@@ -456,16 +456,16 @@ static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
 	radeon_emit(cs, 1);
 }
 
-static void compute_emit_cs(struct r600_context *ctx,
+static void compute_emit_cs(struct r600_context *rctx,
 			    const uint *block_layout,
 			    const uint *grid_layout)
 {
-	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned i;
 
 	/* make sure that the gfx ring is only one active */
-	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
-		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+	if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
+		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 
 	/* Initialize all the compute-related registers.
@@ -473,20 +473,20 @@ static void compute_emit_cs(struct r600_context *ctx,
 	 * See evergreen_init_atom_start_compute_cs() in this file for the list
 	 * of registers initialized by the start_compute_cs_cmd atom.
 	 */
-	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
+	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
 
 	/* emit config state */
-	if (ctx->b.chip_class == EVERGREEN)
-		r600_emit_atom(ctx, &ctx->config_state.atom);
+	if (rctx->b.chip_class == EVERGREEN)
+		r600_emit_atom(rctx, &rctx->config_state.atom);
 
-	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
-	r600_flush_emit(ctx);
+	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
+	r600_flush_emit(rctx);
 
 	/* Emit colorbuffers. */
 	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
-	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
-		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
-		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
+	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
+		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
+		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 						       (struct r600_resource*)cb->base.texture,
 						       RADEON_USAGE_READWRITE,
 						       RADEON_PRIO_SHADER_RW_BUFFER);
@@ -515,37 +515,37 @@ static void compute_emit_cs(struct r600_context *ctx,
 
 	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
 	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
-					ctx->compute_cb_target_mask);
+					rctx->compute_cb_target_mask);
 
 
 	/* Emit vertex buffer state */
-	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
-	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
+	rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
+	r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
 
 	/* Emit constant buffer state */
-	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
+	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 
 	/* Emit sampler state */
-	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
+	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
 
 	/* Emit sampler view (texture resource) state */
-	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
+	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
 
 	/* Emit compute shader state */
-	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
+	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
 
 	/* Emit dispatch state and dispatch packet */
-	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
+	evergreen_emit_direct_dispatch(rctx, block_layout, grid_layout);
 
 	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
 	 */
-	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
+	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 		      R600_CONTEXT_INV_VERTEX_CACHE |
 	              R600_CONTEXT_INV_TEX_CACHE;
-	r600_flush_emit(ctx);
-	ctx->b.flags = 0;
+	r600_flush_emit(rctx);
+	rctx->b.flags = 0;
 
-	if (ctx->b.chip_class >= CAYMAN) {
+	if (rctx->b.chip_class >= CAYMAN) {
 		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
 		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
 		/* DEALLOC_STATE prevents the GPU from hanging when a
@@ -557,9 +557,9 @@ static void compute_emit_cs(struct r600_context *ctx,
 	}
 
 #if 0
-	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
+	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
 	for (i = 0; i < cs->cdw; i++) {
-		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
+		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
 	}
 #endif
 
@@ -601,32 +601,32 @@ void evergreen_emit_cs_shader(struct r600_context *rctx,
 static void evergreen_launch_grid(struct pipe_context *ctx_,
 				  const struct pipe_grid_info *info)
 {
-	struct r600_context *ctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx_;
 #ifdef HAVE_OPENCL
-	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
+	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 	boolean use_kill;
 
-	ctx->cs_shader_state.pc = info->pc;
+	rctx->cs_shader_state.pc = info->pc;
 	/* Get the config information for this kernel. */
 	r600_shader_binary_read_config(&shader->binary, &shader->bc,
                                   info->pc, &use_kill);
 #endif
 
-	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
+	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
 
 
 	evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
-	compute_emit_cs(ctx, info->block, info->grid);
+	compute_emit_cs(rctx, info->block, info->grid);
 }
 
 static void evergreen_set_compute_resources(struct pipe_context *ctx_,
 					    unsigned start, unsigned count,
 					    struct pipe_surface **surfaces)
 {
-	struct r600_context *ctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx_;
 	struct r600_surface **resources = (struct r600_surface **)surfaces;
 
-	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
+	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
 			start, count);
 
 	for (unsigned i = 0; i < count; i++) {
@@ -640,13 +640,13 @@ static void evergreen_set_compute_resources(struct pipe_context *ctx_,
 			if (resources[i]->base.writable) {
 				assert(i+1 < 12);
 
-				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
+				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
 				(struct r600_resource *)resources[i]->base.texture,
 				buffer->chunk->start_in_dw*4,
 				resources[i]->base.texture->width0);
 			}
 
-			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
+			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
 					buffer->chunk->start_in_dw * 4,
 					resources[i]->base.texture);
 		}
@@ -658,13 +658,13 @@ static void evergreen_set_global_binding(struct pipe_context *ctx_,
 					 struct pipe_resource **resources,
 					 uint32_t **handles)
 {
-	struct r600_context *ctx = (struct r600_context *)ctx_;
-	struct compute_memory_pool *pool = ctx->screen->global_pool;
+	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct compute_memory_pool *pool = rctx->screen->global_pool;
 	struct r600_resource_global **buffers =
 		(struct r600_resource_global **)resources;
 	unsigned i;
 
-	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
+	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
 			first, n);
 
 	if (!resources) {
@@ -699,8 +699,8 @@ static void evergreen_set_global_binding(struct pipe_context *ctx_,
 		*(handles[i]) = util_cpu_to_le32(handle);
 	}
 
-	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
-	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
+	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
+	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
 				(struct pipe_resource*)pool->bo);
 }
 
@@ -715,9 +715,9 @@ static void evergreen_set_global_binding(struct pipe_context *ctx_,
  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
  * on the GPU family.
  */
-void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
+void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
 {
-	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
+	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
 	int num_threads;
 	int num_stack_entries;
 
@@ -736,7 +736,7 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 
-	switch (ctx->b.family) {
+	switch (rctx->b.family) {
 	case CHIP_CEDAR:
 	default:
 		num_threads = 128;
@@ -782,18 +782,18 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 	}
 
 	/* Config Registers */
-	if (ctx->b.chip_class < CAYMAN)
-		evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
-					   ctx->screen->b.info.drm_minor);
+	if (rctx->b.chip_class < CAYMAN)
+		evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
+					   rctx->screen->b.info.drm_minor);
 	else
-		cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
-					ctx->screen->b.info.drm_minor);
+		cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
+					rctx->screen->b.info.drm_minor);
 
 	/* The primitive type always needs to be POINTLIST for compute. */
 	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
 						V_008958_DI_PT_POINTLIST);
 
-	if (ctx->b.chip_class < CAYMAN) {
+	if (rctx->b.chip_class < CAYMAN) {
 
 		/* These registers control which simds can be used by each stage.
 		 * The default for these registers is 0xffffffff, which means
@@ -843,7 +843,7 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 	 * allocate the appropriate amount of LDS dwords using the
 	 * CM_R_0288E8_SQ_LDS_ALLOC register.
 	 */
-	if (ctx->b.chip_class < CAYMAN) {
+	if (rctx->b.chip_class < CAYMAN) {
 		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
 			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
 	} else {
@@ -854,7 +854,7 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 
 	/* Context Registers */
 
-	if (ctx->b.chip_class < CAYMAN) {
+	if (rctx->b.chip_class < CAYMAN) {
 		/* workaround for hw issues with dyn gpr - must set all limits
 		 * to 240 instead of 0, 0x1e == 240 / 8
 		 */
@@ -896,15 +896,15 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
 	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
 }
 
-void evergreen_init_compute_state_functions(struct r600_context *ctx)
+void evergreen_init_compute_state_functions(struct r600_context *rctx)
 {
-	ctx->b.b.create_compute_state = evergreen_create_compute_state;
-	ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
-	ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
-//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
-	ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
-	ctx->b.b.set_global_binding = evergreen_set_global_binding;
-	ctx->b.b.launch_grid = evergreen_launch_grid;
+	rctx->b.b.create_compute_state = evergreen_create_compute_state;
+	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
+	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
+//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
+	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
+	rctx->b.b.set_global_binding = evergreen_set_global_binding;
+	rctx->b.b.launch_grid = evergreen_launch_grid;
 
 }
 

From a6e17d7d698bb19b28e4120c1587605545723c1e Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 6 Apr 2016 22:24:35 +0100
Subject: [PATCH 51/72] r600: in evergreen_compute use ctx consistently instead
 of ctx_

Acked-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 50 ++++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 6abb77f676c..6f317b4e2d7 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -251,10 +251,10 @@ static void r600_destroy_shader(struct r600_bytecode *bc)
 	FREE(bc->bytecode);
 }
 
-void *evergreen_create_compute_state(struct pipe_context *ctx_,
+void *evergreen_create_compute_state(struct pipe_context *ctx,
 				     const const struct pipe_compute_state *cso)
 {
-	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
 #ifdef HAVE_OPENCL
 	const struct pipe_llvm_program_header *header;
@@ -284,9 +284,9 @@ void *evergreen_create_compute_state(struct pipe_context *ctx_,
 	return shader;
 }
 
-void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
+void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
 {
-	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_compute *shader = state;
 
 	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
@@ -302,9 +302,9 @@ void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
 	FREE(shader);
 }
 
-static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
+static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
 {
-	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
 
@@ -322,12 +322,12 @@ static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
  *             (x,y,z)
  * DWORDS 9+ : Kernel parameters
  */
-void evergreen_compute_upload_input(struct pipe_context *ctx_,
+void evergreen_compute_upload_input(struct pipe_context *ctx,
 				    const uint *block_layout,
 				    const uint *grid_layout,
 				    const void *input)
 {
-	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 	unsigned i;
 	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
@@ -348,12 +348,12 @@ void evergreen_compute_upload_input(struct pipe_context *ctx_,
 	if (!shader->kernel_param) {
 		/* Add space for the grid dimensions */
 		shader->kernel_param = (struct r600_resource *)
-			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
+			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
 					PIPE_USAGE_IMMUTABLE, input_size);
 	}
 
 	u_box_1d(0, input_size, &box);
-	num_work_groups_start = ctx_->transfer_map(ctx_,
+	num_work_groups_start = ctx->transfer_map(ctx,
 			(struct pipe_resource*)shader->kernel_param,
 			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
 			&box, &transfer);
@@ -380,7 +380,7 @@ void evergreen_compute_upload_input(struct pipe_context *ctx_,
 			((unsigned*)num_work_groups_start)[i]);
 	}
 
-	ctx_->transfer_unmap(ctx_, transfer);
+	ctx->transfer_unmap(ctx, transfer);
 
 	/* ID=0 is reserved for the parameters */
 	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
@@ -598,10 +598,10 @@ void evergreen_emit_cs_shader(struct r600_context *rctx,
 					      RADEON_PRIO_USER_SHADER));
 }
 
-static void evergreen_launch_grid(struct pipe_context *ctx_,
+static void evergreen_launch_grid(struct pipe_context *ctx,
 				  const struct pipe_grid_info *info)
 {
-	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx;
 #ifdef HAVE_OPENCL
 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 	boolean use_kill;
@@ -615,15 +615,15 @@ static void evergreen_launch_grid(struct pipe_context *ctx_,
 	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
 
 
-	evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
+	evergreen_compute_upload_input(ctx, info->block, info->grid, info->input);
 	compute_emit_cs(rctx, info->block, info->grid);
 }
 
-static void evergreen_set_compute_resources(struct pipe_context *ctx_,
+static void evergreen_set_compute_resources(struct pipe_context *ctx,
 					    unsigned start, unsigned count,
 					    struct pipe_surface **surfaces)
 {
-	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_surface **resources = (struct r600_surface **)surfaces;
 
 	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
@@ -653,12 +653,12 @@ static void evergreen_set_compute_resources(struct pipe_context *ctx_,
 	}
 }
 
-static void evergreen_set_global_binding(struct pipe_context *ctx_,
+static void evergreen_set_global_binding(struct pipe_context *ctx,
 					 unsigned first, unsigned n,
 					 struct pipe_resource **resources,
 					 uint32_t **handles)
 {
-	struct r600_context *rctx = (struct r600_context *)ctx_;
+	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct compute_memory_pool *pool = rctx->screen->global_pool;
 	struct r600_resource_global **buffers =
 		(struct r600_resource_global **)resources;
@@ -681,7 +681,7 @@ static void evergreen_set_global_binding(struct pipe_context *ctx_,
 			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
 	}
 
-	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
+	if (compute_memory_finalize_pending(pool, ctx) == -1) {
 		/* XXX: Unset */
 		return;
 	}
@@ -965,14 +965,14 @@ void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
 	free(res);
 }
 
-void *r600_compute_global_transfer_map(struct pipe_context *ctx_,
+void *r600_compute_global_transfer_map(struct pipe_context *ctx,
 				       struct pipe_resource *resource,
 				       unsigned level,
 				       unsigned usage,
 				       const struct pipe_box *box,
 				       struct pipe_transfer **ptransfer)
 {
-	struct r600_context *rctx = (struct r600_context*)ctx_;
+	struct r600_context *rctx = (struct r600_context*)ctx;
 	struct compute_memory_pool *pool = rctx->screen->global_pool;
 	struct r600_resource_global* buffer =
 		(struct r600_resource_global*)resource;
@@ -982,7 +982,7 @@ void *r600_compute_global_transfer_map(struct pipe_context *ctx_,
 	unsigned offset = box->x;
 
 	if (is_item_in_pool(item)) {
-		compute_memory_demote_item(pool, item, ctx_);
+		compute_memory_demote_item(pool, item, ctx);
 	}
 	else {
 		if (item->real_buffer == NULL) {
@@ -1012,11 +1012,11 @@ void *r600_compute_global_transfer_map(struct pipe_context *ctx_,
 	assert(box->z == 0);
 
 	///TODO: do it better, mapping is not possible if the pool is too big
-	return pipe_buffer_map_range(ctx_, dst,
+	return pipe_buffer_map_range(ctx, dst,
 			offset, box->width, usage, ptransfer);
 }
 
-void r600_compute_global_transfer_unmap(struct pipe_context *ctx_,
+void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
 					struct pipe_transfer *transfer)
 {
 	/* struct r600_resource_global are not real resources, they just map
@@ -1032,7 +1032,7 @@ void r600_compute_global_transfer_unmap(struct pipe_context *ctx_,
 	assert (!"This function should not be called");
 }
 
-void r600_compute_global_transfer_flush_region(struct pipe_context *ctx_,
+void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
 					       struct pipe_transfer *transfer,
 					       const struct pipe_box *box)
 {

From 41558efa87d610515f88d2c562c785f976f4f641 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 6 Apr 2016 22:28:23 +0100
Subject: [PATCH 52/72] r600: use pipe_grid_info more in evergreen_compute.

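Pass the whole pipe_grid_info down to the upload and dispatch helpers;
there is no reason to pull its block/grid/input pieces apart in the
caller.  Also make evergreen_compute_upload_input() static, as it is
unused outside this file, and rename evergreen_emit_direct_dispatch()
to evergreen_emit_dispatch().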

Acked-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 46 +++++++++-----------
 src/gallium/drivers/r600/evergreen_compute.h |  1 -
 2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 6f317b4e2d7..6cd49c94226 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -322,10 +322,8 @@ static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
  *             (x,y,z)
  * DWORDS 9+ : Kernel parameters
  */
-void evergreen_compute_upload_input(struct pipe_context *ctx,
-				    const uint *block_layout,
-				    const uint *grid_layout,
-				    const void *input)
+static void evergreen_compute_upload_input(struct pipe_context *ctx,
+					   const struct pipe_grid_info *info)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
@@ -362,18 +360,18 @@ void evergreen_compute_upload_input(struct pipe_context *ctx,
 	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
 
 	/* Copy the work group size */
-	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
+	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
 
 	/* Copy the global size */
 	for (i = 0; i < 3; i++) {
-		global_size_start[i] = grid_layout[i] * block_layout[i];
+		global_size_start[i] = info->grid[i] * info->block[i];
 	}
 
 	/* Copy the local dimensions */
-	memcpy(local_size_start, block_layout, 3 * sizeof(uint));
+	memcpy(local_size_start, info->block, 3 * sizeof(uint));
 
 	/* Copy the kernel inputs */
-	memcpy(kernel_parameters_start, input, shader->input_size);
+	memcpy(kernel_parameters_start, info->input, shader->input_size);
 
 	for (i = 0; i < (input_size / 4); i++) {
 		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
@@ -387,9 +385,8 @@ void evergreen_compute_upload_input(struct pipe_context *ctx,
 			(struct pipe_resource*)shader->kernel_param);
 }
 
-static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
-					   const uint *block_layout,
-					   const uint *grid_layout)
+static void evergreen_emit_dispatch(struct r600_context *rctx,
+				    const struct pipe_grid_info *info)
 {
 	int i;
 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
@@ -405,15 +402,15 @@ static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
 
 	/* Calculate group_size/grid_size */
 	for (i = 0; i < 3; i++) {
-		group_size *= block_layout[i];
+		group_size *= info->block[i];
 	}
 
 	for (i = 0; i < 3; i++)	{
-		grid_size *= grid_layout[i];
+		grid_size *= info->grid[i];
 	}
 
 	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
-	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
+	num_waves = (info->block[0] * info->block[1] * info->block[2] +
 			wave_divisor - 1) / wave_divisor;
 
 	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
@@ -432,9 +429,9 @@ static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
 								group_size);
 
 	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
-	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
-	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
-	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
+	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
+	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
+	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
 
 	if (rctx->b.chip_class < CAYMAN) {
 		assert(lds_size <= 8192);
@@ -449,16 +446,15 @@ static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
 
 	/* Dispatch packet */
 	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
-	radeon_emit(cs, grid_layout[0]);
-	radeon_emit(cs, grid_layout[1]);
-	radeon_emit(cs, grid_layout[2]);
+	radeon_emit(cs, info->grid[0]);
+	radeon_emit(cs, info->grid[1]);
+	radeon_emit(cs, info->grid[2]);
 	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
 	radeon_emit(cs, 1);
 }
 
 static void compute_emit_cs(struct r600_context *rctx,
-			    const uint *block_layout,
-			    const uint *grid_layout)
+			    const struct pipe_grid_info *info)
 {
 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned i;
@@ -535,7 +531,7 @@ static void compute_emit_cs(struct r600_context *rctx,
 	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
 
 	/* Emit dispatch state and dispatch packet */
-	evergreen_emit_direct_dispatch(rctx, block_layout, grid_layout);
+	evergreen_emit_dispatch(rctx, info);
 
 	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
 	 */
@@ -615,8 +611,8 @@ static void evergreen_launch_grid(struct pipe_context *ctx,
 	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
 
 
-	evergreen_compute_upload_input(ctx, info->block, info->grid, info->input);
-	compute_emit_cs(rctx, info->block, info->grid);
+	evergreen_compute_upload_input(ctx, info);
+	compute_emit_cs(rctx, info);
 }
 
 static void evergreen_set_compute_resources(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/r600/evergreen_compute.h b/src/gallium/drivers/r600/evergreen_compute.h
index e4d3a38e415..f1f608c15ad 100644
--- a/src/gallium/drivers/r600/evergreen_compute.h
+++ b/src/gallium/drivers/r600/evergreen_compute.h
@@ -40,7 +40,6 @@ struct r600_resource_global {
 
 void *evergreen_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso);
 void evergreen_delete_compute_state(struct pipe_context *ctx, void *state);
-void evergreen_compute_upload_input(struct pipe_context *context, const uint *block_layout, const uint *grid_layout, const void *input);
 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx);
 void evergreen_init_compute_state_functions(struct r600_context *rctx);
 void evergreen_emit_cs_shader(struct r600_context *rctx, struct r600_atom * atom);

From a5d247dda0d0dfcae5c1e2b730cb7e53e5315676 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 6 Apr 2016 22:35:12 +0100
Subject: [PATCH 53/72] r600: make two compute functions static.

These aren't used outside evergreen_compute.c

Acked-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 6 +++---
 src/gallium/drivers/r600/evergreen_compute.h | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 6cd49c94226..b93ec992111 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -251,8 +251,8 @@ static void r600_destroy_shader(struct r600_bytecode *bc)
 	FREE(bc->bytecode);
 }
 
-void *evergreen_create_compute_state(struct pipe_context *ctx,
-				     const const struct pipe_compute_state *cso)
+static void *evergreen_create_compute_state(struct pipe_context *ctx,
+					    const const struct pipe_compute_state *cso)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
@@ -284,7 +284,7 @@ void *evergreen_create_compute_state(struct pipe_context *ctx,
 	return shader;
 }
 
-void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
+static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_compute *shader = state;
diff --git a/src/gallium/drivers/r600/evergreen_compute.h b/src/gallium/drivers/r600/evergreen_compute.h
index f1f608c15ad..59e39462988 100644
--- a/src/gallium/drivers/r600/evergreen_compute.h
+++ b/src/gallium/drivers/r600/evergreen_compute.h
@@ -38,8 +38,6 @@ struct r600_resource_global {
 	struct compute_memory_item *chunk;
 };
 
-void *evergreen_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso);
-void evergreen_delete_compute_state(struct pipe_context *ctx, void *state);
 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx);
 void evergreen_init_compute_state_functions(struct r600_context *rctx);
 void evergreen_emit_cs_shader(struct r600_context *rctx, struct r600_atom * atom);

From 0c40b6f96c705d1e67b5d58ff90774cd0be524a7 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 6 Apr 2016 22:35:53 +0100
Subject: [PATCH 54/72] r600: make compute global buffer functions static.

This moves things around so that the global buffer handling
functions in evergreen_compute.c are static.

Acked-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 172 +++++++++----------
 src/gallium/drivers/r600/evergreen_compute.h |  12 --
 2 files changed, 86 insertions(+), 98 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index b93ec992111..07977b7db3e 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -177,16 +177,6 @@ static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
 	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
 }
 
-static const struct u_resource_vtbl r600_global_buffer_vtbl =
-{
-	u_default_resource_get_handle, /* get_handle */
-	r600_compute_global_buffer_destroy, /* resource_destroy */
-	r600_compute_global_transfer_map, /* transfer_map */
-	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
-	r600_compute_global_transfer_unmap, /* transfer_unmap */
-	r600_compute_global_transfer_inline_write /* transfer_inline_write */
-};
-
 /* We need to define these R600 registers here, because we can't include
  * evergreend.h and r600d.h.
  */
@@ -904,69 +894,12 @@ void evergreen_init_compute_state_functions(struct r600_context *rctx)
 
 }
 
-struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
-							const struct pipe_resource *templ)
-{
-	struct r600_resource_global* result = NULL;
-	struct r600_screen* rscreen = NULL;
-	int size_in_dw = 0;
-
-	assert(templ->target == PIPE_BUFFER);
-	assert(templ->bind & PIPE_BIND_GLOBAL);
-	assert(templ->array_size == 1 || templ->array_size == 0);
-	assert(templ->depth0 == 1 || templ->depth0 == 0);
-	assert(templ->height0 == 1 || templ->height0 == 0);
-
-	result = (struct r600_resource_global*)
-	CALLOC(sizeof(struct r600_resource_global), 1);
-	rscreen = (struct r600_screen*)screen;
-
-	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
-	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
-			templ->array_size);
-
-	result->base.b.vtbl = &r600_global_buffer_vtbl;
-	result->base.b.b = *templ;
-	result->base.b.b.screen = screen;
-	pipe_reference_init(&result->base.b.b.reference, 1);
-
-	size_in_dw = (templ->width0+3) / 4;
-
-	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
-
-	if (result->chunk == NULL)
-	{
-		free(result);
-		return NULL;
-	}
-
-	return &result->base.b.b;
-}
-
-void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
-					struct pipe_resource *res)
-{
-	struct r600_resource_global* buffer = NULL;
-	struct r600_screen* rscreen = NULL;
-
-	assert(res->target == PIPE_BUFFER);
-	assert(res->bind & PIPE_BIND_GLOBAL);
-
-	buffer = (struct r600_resource_global*)res;
-	rscreen = (struct r600_screen*)screen;
-
-	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
-
-	buffer->chunk = NULL;
-	free(res);
-}
-
-void *r600_compute_global_transfer_map(struct pipe_context *ctx,
-				       struct pipe_resource *resource,
-				       unsigned level,
-				       unsigned usage,
-				       const struct pipe_box *box,
-				       struct pipe_transfer **ptransfer)
+static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
+					      struct pipe_resource *resource,
+					      unsigned level,
+					      unsigned usage,
+					      const struct pipe_box *box,
+					      struct pipe_transfer **ptransfer)
 {
 	struct r600_context *rctx = (struct r600_context*)ctx;
 	struct compute_memory_pool *pool = rctx->screen->global_pool;
@@ -1012,8 +945,8 @@ void *r600_compute_global_transfer_map(struct pipe_context *ctx,
 			offset, box->width, usage, ptransfer);
 }
 
-void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
-					struct pipe_transfer *transfer)
+static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
+					       struct pipe_transfer *transfer)
 {
 	/* struct r600_resource_global are not real resources, they just map
 	 * to an offset within the compute memory pool.  The function
@@ -1028,21 +961,88 @@ void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
 	assert (!"This function should not be called");
 }
 
-void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
-					       struct pipe_transfer *transfer,
-					       const struct pipe_box *box)
+static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
+						      struct pipe_transfer *transfer,
+						      const struct pipe_box *box)
 {
 	assert(0 && "TODO");
 }
 
-void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
-					       struct pipe_resource *resource,
-					       unsigned level,
-					       unsigned usage,
-					       const struct pipe_box *box,
-					       const void *data,
-					       unsigned stride,
-					       unsigned layer_stride)
+static void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
+						      struct pipe_resource *resource,
+						      unsigned level,
+						      unsigned usage,
+						      const struct pipe_box *box,
+						      const void *data,
+						      unsigned stride,
+						      unsigned layer_stride)
 {
 	assert(0 && "TODO");
 }
+
+static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
+					       struct pipe_resource *res)
+{
+	struct r600_resource_global* buffer = NULL;
+	struct r600_screen* rscreen = NULL;
+
+	assert(res->target == PIPE_BUFFER);
+	assert(res->bind & PIPE_BIND_GLOBAL);
+
+	buffer = (struct r600_resource_global*)res;
+	rscreen = (struct r600_screen*)screen;
+
+	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
+
+	buffer->chunk = NULL;
+	free(res);
+}
+
+static const struct u_resource_vtbl r600_global_buffer_vtbl =
+{
+	u_default_resource_get_handle, /* get_handle */
+	r600_compute_global_buffer_destroy, /* resource_destroy */
+	r600_compute_global_transfer_map, /* transfer_map */
+	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
+	r600_compute_global_transfer_unmap, /* transfer_unmap */
+	r600_compute_global_transfer_inline_write /* transfer_inline_write */
+};
+
+struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
+							const struct pipe_resource *templ)
+{
+	struct r600_resource_global* result = NULL;
+	struct r600_screen* rscreen = NULL;
+	int size_in_dw = 0;
+
+	assert(templ->target == PIPE_BUFFER);
+	assert(templ->bind & PIPE_BIND_GLOBAL);
+	assert(templ->array_size == 1 || templ->array_size == 0);
+	assert(templ->depth0 == 1 || templ->depth0 == 0);
+	assert(templ->height0 == 1 || templ->height0 == 0);
+
+	result = (struct r600_resource_global*)
+	CALLOC(sizeof(struct r600_resource_global), 1);
+	rscreen = (struct r600_screen*)screen;
+
+	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
+	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
+			templ->array_size);
+
+	result->base.b.vtbl = &r600_global_buffer_vtbl;
+	result->base.b.b = *templ;
+	result->base.b.b.screen = screen;
+	pipe_reference_init(&result->base.b.b.reference, 1);
+
+	size_in_dw = (templ->width0+3) / 4;
+
+	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
+
+	if (result->chunk == NULL)
+	{
+		free(result);
+		return NULL;
+	}
+
+	return &result->base.b.b;
+}
diff --git a/src/gallium/drivers/r600/evergreen_compute.h b/src/gallium/drivers/r600/evergreen_compute.h
index 59e39462988..3c178870d91 100644
--- a/src/gallium/drivers/r600/evergreen_compute.h
+++ b/src/gallium/drivers/r600/evergreen_compute.h
@@ -44,17 +44,5 @@ void evergreen_emit_cs_shader(struct r600_context *rctx, struct r600_atom * atom
 
 struct r600_resource* r600_compute_buffer_alloc_vram(struct r600_screen *screen, unsigned size);
 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ);
-void r600_compute_global_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *res);
-void *r600_compute_global_transfer_map(
-	struct pipe_context *ctx_,
-	struct pipe_resource *resource,
-	unsigned level,
-	unsigned usage,
-	const struct pipe_box *box,
-	struct pipe_transfer **ptransfer);
-void r600_compute_global_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer);
-void r600_compute_global_transfer_flush_region( struct pipe_context *, struct pipe_transfer *, const struct pipe_box *);
-void r600_compute_global_transfer_inline_write( struct pipe_context *, struct pipe_resource *, unsigned level,
-                                                unsigned usage, const struct pipe_box *, const void *data, unsigned stride, unsigned layer_stride);
 
 #endif

From 828d84c8e2a85a771adf80b3b7f47d1e24f4840a Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 6 Apr 2016 22:38:21 +0100
Subject: [PATCH 55/72] r600: use radeon_emit in a few more places in
 evergreen_compute

This is just a cleanup of the code.

Acked-by: Tom Stellard <thomas.stellard@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/evergreen_compute.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 07977b7db3e..6f171487f92 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -532,14 +532,14 @@ static void compute_emit_cs(struct r600_context *rctx,
 	rctx->b.flags = 0;
 
 	if (rctx->b.chip_class >= CAYMAN) {
-		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
-		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 		/* DEALLOC_STATE prevents the GPU from hanging when a
 		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
 		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
 		 */
-		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
-		cs->buf[cs->cdw++] = 0;
+		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
+		radeon_emit(cs, 0);
 	}
 
 #if 0

From 05db68024853fe3613480ca164485ccf67f1a7cc Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Fri, 25 Mar 2016 16:12:19 -0700
Subject: [PATCH 56/72] nir/types: Add a wrapper for count_attribute_slots

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir_types.cpp | 7 +++++++
 src/compiler/nir_types.h   | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp
index 3669cfed360..70e9cd397fc 100644
--- a/src/compiler/nir_types.cpp
+++ b/src/compiler/nir_types.cpp
@@ -124,6 +124,13 @@ glsl_get_aoa_size(const struct glsl_type *type)
    return type->arrays_of_arrays_size();
 }
 
+unsigned
+glsl_count_attribute_slots(const struct glsl_type *type,
+                           bool vertex_input_slots)
+{
+   return type->count_attribute_slots(vertex_input_slots);
+}
+
 const char *
 glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index)
 {
diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h
index 07487838b50..5efdd85dea5 100644
--- a/src/compiler/nir_types.h
+++ b/src/compiler/nir_types.h
@@ -68,6 +68,9 @@ unsigned glsl_get_length(const struct glsl_type *type);
 
 unsigned glsl_get_aoa_size(const struct glsl_type *type);
 
+unsigned glsl_count_attribute_slots(const struct glsl_type *type,
+                                    bool vertex_input_slots);
+
 const char *glsl_get_struct_elem_name(const struct glsl_type *type,
                                       unsigned index);
 

From 715e97e3421ec33c219c9829e7930d135dfe0bf8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Thu, 7 Apr 2016 15:05:33 +0900
Subject: [PATCH 57/72] Revert "clover: Fix build against clang SVN >= r265359"

This reverts commit 0daab9878d2b96356cf667591a2c877d912be52d.

The corresponding clang change was reverted.

Trivial.
---
 src/gallium/state_trackers/clover/llvm/invocation.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 3fb35969a81..4d11c2477c7 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -206,9 +206,6 @@ namespace {
       // http://www.llvm.org/bugs/show_bug.cgi?id=19735
       c.getDiagnosticOpts().ShowCarets = false;
       c.getInvocation().setLangDefaults(c.getLangOpts(), clang::IK_OpenCL,
-#if HAVE_LLVM >= 0x0309
-                                        llvm::Triple(triple),
-#endif
                                         clang::LangStandard::lang_opencl11);
       c.createDiagnostics(
                           new clang::TextDiagnosticPrinter(

From baa0b3f4ccd960bb6131ba16b1f9d8736c6432c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 7 Apr 2016 00:49:32 +0200
Subject: [PATCH 58/72] radeonsi: don't use the real barrier instruction in
 tess ctrl shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index bf3f00867e9..08da3e37550 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4282,6 +4282,14 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 
+	/* The real barrier instruction isn’t needed, because an entire patch
+	 * always fits into a single wave.
+	 */
+	if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) {
+		emit_optimization_barrier(ctx);
+		return;
+	}
+
 	lp_build_intrinsic(gallivm->builder,
 			   HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
 					       : "llvm.AMDGPU.barrier.local",

From 5fac4887d865305f919e1d23cdb3a6f6d7043884 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 7 Apr 2016 02:27:01 +0200
Subject: [PATCH 59/72] radeonsi: disable perfect ZPASS counts for
 PIPE_QUERY_OCCLUSION_PREDICATE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.h |  3 ++-
 src/gallium/drivers/radeon/r600_query.c       | 14 +++++++++++---
 src/gallium/drivers/radeonsi/si_state.c       |  6 ++++--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 381ad21a4e3..062c3193947 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -425,8 +425,9 @@ struct r600_common_context {
 	unsigned flags; /* flush flags */
 
 	/* Queries. */
-	/* The list of active queries. Only one query of each type can be active. */
+	/* The list of active queries. */
 	int				num_occlusion_queries;
+	int				num_perfect_occlusion_queries;
 	/* Keep track of non-timer queries, because they should be suspended
 	 * during context flushing.
 	 * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits,
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index f9a5721fb97..7a2d2ee7f31 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -414,14 +414,22 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
 	if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
 	    type == PIPE_QUERY_OCCLUSION_PREDICATE) {
 		bool old_enable = rctx->num_occlusion_queries != 0;
-		bool enable;
+		bool old_perfect_enable =
+			rctx->num_perfect_occlusion_queries != 0;
+		bool enable, perfect_enable;
 
 		rctx->num_occlusion_queries += diff;
 		assert(rctx->num_occlusion_queries >= 0);
 
-		enable = rctx->num_occlusion_queries != 0;
+		if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
+			rctx->num_perfect_occlusion_queries += diff;
+			assert(rctx->num_perfect_occlusion_queries >= 0);
+		}
 
-		if (enable != old_enable) {
+		enable = rctx->num_occlusion_queries != 0;
+		perfect_enable = rctx->num_perfect_occlusion_queries != 0;
+
+		if (enable != old_enable || perfect_enable != old_perfect_enable) {
 			rctx->set_occlusion_query_state(&rctx->b, enable);
 		}
 	}
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index f559c73f065..057458c843a 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1310,16 +1310,18 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s
 
 	/* DB_COUNT_CONTROL (occlusion queries) */
 	if (sctx->b.num_occlusion_queries > 0) {
+		bool perfect = sctx->b.num_perfect_occlusion_queries > 0;
+
 		if (sctx->b.chip_class >= CIK) {
 			radeon_emit(cs,
-				    S_028004_PERFECT_ZPASS_COUNTS(1) |
+				    S_028004_PERFECT_ZPASS_COUNTS(perfect) |
 				    S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |
 				    S_028004_ZPASS_ENABLE(1) |
 				    S_028004_SLICE_EVEN_ENABLE(1) |
 				    S_028004_SLICE_ODD_ENABLE(1));
 		} else {
 			radeon_emit(cs,
-				    S_028004_PERFECT_ZPASS_COUNTS(1) |
+				    S_028004_PERFECT_ZPASS_COUNTS(perfect) |
 				    S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples));
 		}
 	} else {

From 60cf2fa477c1a91c1f8daea14d14edca80f1e183 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 5 Apr 2016 19:37:16 +0200
Subject: [PATCH 60/72] trace: add missing set_shader_images()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/trace/tr_context.c    | 39 ++++++++++++++++++++++
 src/gallium/drivers/trace/tr_dump_state.c | 40 +++++++++++++++++++++++
 src/gallium/drivers/trace/tr_dump_state.h |  2 ++
 3 files changed, 81 insertions(+)

diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 57f851833e5..08b1d32afb0 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -1686,6 +1686,44 @@ static void trace_context_set_shader_buffers(struct pipe_context *_context,
       FREE(_buffers);
 }
 
+static void trace_context_set_shader_images(struct pipe_context *_context,
+                                            unsigned shader,
+                                            unsigned start, unsigned nr,
+                                            struct pipe_image_view *images)
+{
+   struct trace_context *tr_context = trace_context(_context);
+   struct pipe_context *context = tr_context->pipe;
+   struct pipe_image_view *_images = NULL;
+
+   trace_dump_call_begin("pipe_context", "set_shader_images");
+   trace_dump_arg(ptr, context);
+   trace_dump_arg(uint, shader);
+   trace_dump_arg(uint, start);
+   trace_dump_arg_begin("images");
+   trace_dump_struct_array(image_view, images, nr);
+   trace_dump_arg_end();
+   trace_dump_call_end();
+
+   if (images) {
+      int i;
+
+      _images = MALLOC(nr * sizeof(struct pipe_image_view));
+      if (!_images)
+         return;
+
+      for (i = 0; i < nr; i++) {
+         _images[i] = images[i];
+         _images[i].resource = trace_resource_unwrap(tr_context,
+                                                     _images[i].resource);
+      }
+   }
+
+   context->set_shader_images(context, shader, start, nr, _images);
+
+   if (_images)
+      FREE(_images);
+}
+
 static void trace_context_launch_grid(struct pipe_context *_pipe,
                                       const struct pipe_grid_info *info)
 {
@@ -1809,6 +1847,7 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(set_tess_state);
    TR_CTX_INIT(set_shader_buffers);
    TR_CTX_INIT(launch_grid);
+   TR_CTX_INIT(set_shader_images);
 
    TR_CTX_INIT(transfer_map);
    TR_CTX_INIT(transfer_unmap);
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index e805706f19d..591ca79a2fa 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -740,6 +740,46 @@ void trace_dump_shader_buffer(const struct pipe_shader_buffer *state)
 }
 
 
+void trace_dump_image_view(const struct pipe_image_view *state)
+{
+   if (!trace_dumping_enabled_locked())
+      return;
+   /* state->resource is dereferenced below, so treat a missing one as null. */
+   if (!state || !state->resource) {
+      trace_dump_null();
+      return;
+   }
+
+   trace_dump_struct_begin("pipe_image_view");
+   trace_dump_member(resource_ptr, state, resource);
+   trace_dump_member(uint, state, format);
+   trace_dump_member(uint, state, access);
+
+   trace_dump_member_begin("u");
+   trace_dump_struct_begin(""); /* anonymous */
+   if (state->resource->target == PIPE_BUFFER) {
+      trace_dump_member_begin("buf");
+      trace_dump_struct_begin(""); /* anonymous */
+      trace_dump_member(uint, &state->u.buf, first_element);
+      trace_dump_member(uint, &state->u.buf, last_element);
+      trace_dump_struct_end(); /* anonymous */
+      trace_dump_member_end(); /* buf */
+   } else {
+      trace_dump_member_begin("tex");
+      trace_dump_struct_begin(""); /* anonymous */
+      trace_dump_member(uint, &state->u.tex, first_layer);
+      trace_dump_member(uint, &state->u.tex, last_layer);
+      trace_dump_member(uint, &state->u.tex, level);
+      trace_dump_struct_end(); /* anonymous */
+      trace_dump_member_end(); /* tex */
+   }
+   trace_dump_struct_end(); /* anonymous */
+   trace_dump_member_end(); /* u */
+
+   trace_dump_struct_end();
+}
+
+
 void trace_dump_draw_info(const struct pipe_draw_info *state)
 {
    if (!trace_dumping_enabled_locked())
diff --git a/src/gallium/drivers/trace/tr_dump_state.h b/src/gallium/drivers/trace/tr_dump_state.h
index ee0720d8ac8..fd2bc503052 100644
--- a/src/gallium/drivers/trace/tr_dump_state.h
+++ b/src/gallium/drivers/trace/tr_dump_state.h
@@ -91,4 +91,6 @@ void trace_dump_query_result(unsigned query_type,
 
 void trace_dump_grid_info(const struct pipe_grid_info *state);
 
+void trace_dump_image_view(const struct pipe_image_view *view);
+
 #endif /* TR_STATE_H */

From 9f443af449515240169e69b98bdb3746f38e7f35 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 5 Apr 2016 09:56:49 -0600
Subject: [PATCH 61/72] svga: add some trivial null pointer checks

These small mallocs will probably never fail, but static analysis tools
may complain about the missing checks.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/svga/svga_pipe_blend.c        | 3 +++
 src/gallium/drivers/svga/svga_pipe_depthstencil.c | 3 +++
 src/gallium/drivers/svga/svga_pipe_rasterizer.c   | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
index 0af80cd4296..0ba9313fd5e 100644
--- a/src/gallium/drivers/svga/svga_pipe_blend.c
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -142,6 +142,9 @@ svga_create_blend_state(struct pipe_context *pipe,
    struct svga_blend_state *blend = CALLOC_STRUCT( svga_blend_state );
    unsigned i;
 
+   if (!blend)
+      return NULL;
+
    /* Fill in the per-rendertarget blend state.  We currently only
     * support independent blend enable and colormask per render target.
     */
diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
index d84ed1df48e..83fcdc3d80b 100644
--- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -134,6 +134,9 @@ svga_create_depth_stencil_state(struct pipe_context *pipe,
    struct svga_context *svga = svga_context(pipe);
    struct svga_depth_stencil_state *ds = CALLOC_STRUCT( svga_depth_stencil_state );
 
+   if (!ds)
+      return NULL;
+
    /* Don't try to figure out CW/CCW correspondence with
     * stencil[0]/[1] at this point.  Presumably this can change as
     * back/front face are modified.
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
index 8e0db539574..d397c95da98 100644
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -161,6 +161,9 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
    struct svga_rasterizer_state *rast = CALLOC_STRUCT( svga_rasterizer_state );
    struct svga_screen *screen = svga_screen(pipe->screen);
 
+   if (!rast)
+      return NULL;
+
    /* need this for draw module. */
    rast->templ = *templ;
 

From b7e67b233717dddd6a1dd13fd571fee571d173bf Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 4 Apr 2016 19:39:58 -0600
Subject: [PATCH 62/72] svga: new SVGA_MSAA env var to disable/enable MSAA
 pixel formats

On by default.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/svga/svga_screen.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index ccf794ecda7..536fb6f786f 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -1000,8 +1000,10 @@ svga_screen_create(struct svga_winsys_screen *sws)
       svgascreen->max_color_buffers = SVGA3D_DX_MAX_RENDER_TARGETS;
 
       /* Multisample samples per pixel */
-      svgascreen->ms_samples =
-         get_uint_cap(sws, SVGA3D_DEVCAP_MULTISAMPLE_MASKABLESAMPLES, 0);
+      if (debug_get_bool_option("SVGA_MSAA", TRUE)) {
+         svgascreen->ms_samples =
+            get_uint_cap(sws, SVGA3D_DEVCAP_MULTISAMPLE_MASKABLESAMPLES, 0);
+      }
 
       /* Maximum number of constant buffers */
       svgascreen->max_const_buffers =

From 040f5cb09edef9b54510ca4b41c44f59a1f01fc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Apr 2016 12:57:51 -0500
Subject: [PATCH 63/72] util/pstipple: stronger guard against no free samplers
 (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When hasFixedUnit is false, polygon stippling will fail when there is no free
sampler available. Make the corresponding guard more robust in preparation
of raising PIPE_MAX_SAMPLERS to 32.

The literal 1 is a (signed) int, and shifting into the sign bit is undefined
in C, so change occurrences of 1 to 1u.

v2: add an assert for bitfield size and use 1u << idx

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com> (v1)
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> (v1)
Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v1)
---
 src/gallium/auxiliary/util/u_pstipple.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index bcbe2a25b25..3ae8923f953 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -204,7 +204,7 @@ pstip_transform_decl(struct tgsi_transform_context *ctx,
    if (decl->Declaration.File == TGSI_FILE_SAMPLER) {
       uint i;
       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-         pctx->samplersUsed |= 1 << i;
+         pctx->samplersUsed |= 1u << i;
       }
    }
    else if (decl->Declaration.File == pctx->wincoordFile) {
@@ -266,9 +266,11 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
    int texTemp;
    int sampIdx;
 
+   STATIC_ASSERT(sizeof(pctx->samplersUsed) * 8 >= PIPE_MAX_SAMPLERS);
+
    /* find free texture sampler */
    pctx->freeSampler = free_bit(pctx->samplersUsed);
-   if (pctx->freeSampler >= PIPE_MAX_SAMPLERS)
+   if (pctx->freeSampler < 0 || pctx->freeSampler >= PIPE_MAX_SAMPLERS)
       pctx->freeSampler = PIPE_MAX_SAMPLERS - 1;
 
    if (pctx->wincoordInput < 0)

From cc39879989a1f24c232a7d1b7037c4d3fcff2ce2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Apr 2016 16:27:21 -0500
Subject: [PATCH 64/72] draw/aaline: stronger guard against no free samplers
 (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Line anti-aliasing will fail when there is no free sampler available. Make
the corresponding guard more robust in preparation for raising
PIPE_MAX_SAMPLERS to 32.

The literal 1 is a (signed) int, and shifting into the sign bit is undefined
in C, so change occurrences of 1 to 1u.

v2: add an assert for bitfield size and use 1u << idx

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com> (v1)
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> (v1)
Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v1)
---
 src/gallium/auxiliary/draw/draw_pipe_aaline.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index cd9ee5434d3..a5f07236e83 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -163,7 +163,7 @@ aa_transform_decl(struct tgsi_transform_context *ctx,
       uint i;
       for (i = decl->Range.First;
            i <= decl->Range.Last; i++) {
-         aactx->samplersUsed |= 1 << i;
+         aactx->samplersUsed |= 1u << i;
       }
    }
    else if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
@@ -208,9 +208,11 @@ aa_transform_prolog(struct tgsi_transform_context *ctx)
    struct aa_transform_context *aactx = (struct aa_transform_context *) ctx;
    uint i;
 
+   STATIC_ASSERT(sizeof(aactx->samplersUsed) * 8 >= PIPE_MAX_SAMPLERS);
+
    /* find free sampler */
    aactx->freeSampler = free_bit(aactx->samplersUsed);
-   if (aactx->freeSampler >= PIPE_MAX_SAMPLERS)
+   if (aactx->freeSampler < 0 || aactx->freeSampler >= PIPE_MAX_SAMPLERS)
       aactx->freeSampler = PIPE_MAX_SAMPLERS - 1;
 
    /* find two free temp regs */

From 4bfcc86bf977ac18465c7be0a0fa14354b18d7c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 7 Apr 2016 12:19:56 -0500
Subject: [PATCH 65/72] tgsi/scan: add an assert for the size of the
 samplers_declared bitfield

The literal 1 is a (signed) int, and shifting into the sign bit is undefined
in C, so change occurrences of 1 to 1u.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index d90fb1d68df..c5ef16810a2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -464,7 +464,8 @@ scan_declaration(struct tgsi_shader_info *info,
             }
          }
       } else if (file == TGSI_FILE_SAMPLER) {
-         info->samplers_declared |= 1 << reg;
+         STATIC_ASSERT(sizeof(info->samplers_declared) * 8 >= PIPE_MAX_SAMPLERS);
+         info->samplers_declared |= 1u << reg;
       } else if (file == TGSI_FILE_SAMPLER_VIEW) {
          unsigned target = fulldecl->SamplerView.Resource;
          assert(target < TGSI_TEXTURE_UNKNOWN);

From 84c4d069ac7be1dece2f5eeed277089a79e6acbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Apr 2016 16:21:28 -0500
Subject: [PATCH 66/72] st/glsl_to_tgsi: make samplers_used an uint32_t (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is used as a bitfield, so it seems cleaner to keep it unsigned.

The literal 1 is a (signed) int, and shifting into the sign bit is undefined
in C, so change occurrences of 1 to 1u.

v2: add an assert for bitfield size and use 1u << idx

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com> (v1)
Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v1)
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index cd481c166e7..b9ab7ae9919 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -389,7 +389,7 @@ public:
    unsigned num_output_arrays;
 
    int num_address_regs;
-   int samplers_used;
+   uint32_t samplers_used;
    glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
    int sampler_targets[PIPE_MAX_SAMPLERS];   /**< One of TGSI_TEXTURE_* */
    int buffers_used;
@@ -4290,6 +4290,8 @@ glsl_to_tgsi_visitor::visit(ir_barrier *ir)
 
 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
 {
+   STATIC_ASSERT(sizeof(samplers_used) * 8 >= PIPE_MAX_SAMPLERS);
+
    result.file = PROGRAM_UNDEFINED;
    next_temp = 1;
    array_sizes = NULL;
@@ -4346,7 +4348,7 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
       if (inst->info->is_tex) {
          for (int i = 0; i < inst->sampler_array_size; i++) {
             unsigned idx = inst->sampler_base + i;
-            v->samplers_used |= 1 << idx;
+            v->samplers_used |= 1u << idx;
 
             debug_assert(idx < (int)ARRAY_SIZE(v->sampler_types));
             v->sampler_types[idx] = inst->tex_type;
@@ -6325,7 +6327,7 @@ st_translate_program(
 
    /* texture samplers */
    for (i = 0; i < frag_const->MaxTextureImageUnits; i++) {
-      if (program->samplers_used & (1 << i)) {
+      if (program->samplers_used & (1u << i)) {
          unsigned type;
 
          t->samplers[i] = ureg_DECL_sampler(ureg, i);

From f09036f6c0433c37b826319e2345f88b43b3f141 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Apr 2016 11:51:47 -0500
Subject: [PATCH 67/72] gallium: raise PIPE_MAX_SAMPLERS to 32
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous value of 18 was motivated by having drivers that want to expose
16 samplers but also use some additional samplers for internal use. Raising
the value even higher isn't going to hurt that case.

On the other hand, some drivers actually use PIPE_MAX_SAMPLERS as the number
of samplers they expose externally, so raising this number above 32 is fragile
(because several places in the code use bitfields, and tracking down and
widening all of them is prone to miss some case).

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/pipe/p_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index cb806cb6550..9e466cefd8c 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -57,7 +57,7 @@ extern "C" {
 #define PIPE_MAX_CLIP_PLANES       8
 #define PIPE_MAX_COLOR_BUFS        8
 #define PIPE_MAX_CONSTANT_BUFFERS 32
-#define PIPE_MAX_SAMPLERS         18 /* 16 public + 2 driver internal */
+#define PIPE_MAX_SAMPLERS         32
 #define PIPE_MAX_SHADER_INPUTS    80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_OUTPUTS   80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_SAMPLER_VIEWS 32

From f270067ef9c755a10d852c3e6ef4e9a01aabcb17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Apr 2016 11:45:37 -0500
Subject: [PATCH 68/72] radeonsi: replace magic 16 by SI_NUM_USER_SAMPLERS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 97be59d2272..41bb84d68df 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -523,7 +523,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		return 0;
 	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
-		return 16;
+		return SI_NUM_USER_SAMPLERS;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_SUPPORTED_IRS:

From 9d2693f58ad27464aabf556e7d6bd4c4eb0fd591 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Apr 2016 11:58:42 -0500
Subject: [PATCH 69/72] radeonsi: expand the compressed color and depth texture
 masks to 64 bits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is in preparation for raising the number of exposed sampler views to 32,
which will raise the total number of sampler views to 33 once the polygon
stipple texture is counted. That texture should never be compressed (and it's
certainly not a depth texture), but this approach seems cleaner to me than
special-casing the last slot in all affected code paths.

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c        | 12 +++++------
 src/gallium/drivers/radeonsi/si_descriptors.c | 20 +++++++++----------
 src/gallium/drivers/radeonsi/si_pipe.h        |  4 ++--
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index e0dbec5fb79..c5ea8b17119 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -246,14 +246,14 @@ si_flush_depth_textures(struct si_context *sctx,
 			struct si_textures_info *textures)
 {
 	unsigned i;
-	unsigned mask = textures->depth_texture_mask;
+	uint64_t mask = textures->depth_texture_mask;
 
 	while (mask) {
 		struct pipe_sampler_view *view;
 		struct si_sampler_view *sview;
 		struct r600_texture *tex;
 
-		i = u_bit_scan(&mask);
+		i = u_bit_scan64(&mask);
 
 		view = textures->views.views[i];
 		assert(view);
@@ -329,13 +329,13 @@ si_decompress_sampler_color_textures(struct si_context *sctx,
 				     struct si_textures_info *textures)
 {
 	unsigned i;
-	unsigned mask = textures->compressed_colortex_mask;
+	uint64_t mask = textures->compressed_colortex_mask;
 
 	while (mask) {
 		struct pipe_sampler_view *view;
 		struct r600_texture *tex;
 
-		i = u_bit_scan(&mask);
+		i = u_bit_scan64(&mask);
 
 		view = textures->views.views[i];
 		assert(view);
@@ -355,13 +355,13 @@ si_decompress_image_color_textures(struct si_context *sctx,
 				   struct si_images_info *images)
 {
 	unsigned i;
-	unsigned mask = images->compressed_colortex_mask;
+	uint64_t mask = images->compressed_colortex_mask;
 
 	while (mask) {
 		const struct pipe_image_view *view;
 		struct r600_texture *tex;
 
-		i = u_bit_scan(&mask);
+		i = u_bit_scan64(&mask);
 
 		view = &images->views[i];
 		assert(view->resource->target != PIPE_BUFFER);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 815b87bbd7e..6dd2e4fd89d 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -264,8 +264,8 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 		unsigned slot = start + i;
 
 		if (!views || !views[i]) {
-			samplers->depth_texture_mask &= ~(1 << slot);
-			samplers->compressed_colortex_mask &= ~(1 << slot);
+			samplers->depth_texture_mask &= ~(1llu << slot);
+			samplers->compressed_colortex_mask &= ~(1llu << slot);
 			si_set_sampler_view(sctx, &samplers->views, slot, NULL);
 			continue;
 		}
@@ -277,18 +277,18 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 				(struct r600_texture*)views[i]->texture;
 
 			if (rtex->is_depth && !rtex->is_flushing_texture) {
-				samplers->depth_texture_mask |= 1 << slot;
+				samplers->depth_texture_mask |= 1llu << slot;
 			} else {
-				samplers->depth_texture_mask &= ~(1 << slot);
+				samplers->depth_texture_mask &= ~(1llu << slot);
 			}
 			if (is_compressed_colortex(rtex)) {
-				samplers->compressed_colortex_mask |= 1 << slot;
+				samplers->compressed_colortex_mask |= 1llu << slot;
 			} else {
-				samplers->compressed_colortex_mask &= ~(1 << slot);
+				samplers->compressed_colortex_mask &= ~(1llu << slot);
 			}
 		} else {
-			samplers->depth_texture_mask &= ~(1 << slot);
-			samplers->compressed_colortex_mask &= ~(1 << slot);
+			samplers->depth_texture_mask &= ~(1llu << slot);
+			samplers->compressed_colortex_mask &= ~(1llu << slot);
 		}
 	}
 }
@@ -306,9 +306,9 @@ si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
 			struct r600_texture *rtex = (struct r600_texture *)res;
 
 			if (is_compressed_colortex(rtex)) {
-				samplers->compressed_colortex_mask |= 1 << i;
+				samplers->compressed_colortex_mask |= 1llu << i;
 			} else {
-				samplers->compressed_colortex_mask &= ~(1 << i);
+				samplers->compressed_colortex_mask &= ~(1llu << i);
 			}
 		}
 	}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6d0d687fe4c..4158fc5461e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -137,8 +137,8 @@ struct si_cs_shader_state {
 
 struct si_textures_info {
 	struct si_sampler_views		views;
-	uint32_t			depth_texture_mask; /* which textures are depth */
-	uint32_t			compressed_colortex_mask;
+	uint64_t			depth_texture_mask; /* which textures are depth */
+	uint64_t			compressed_colortex_mask;
 };
 
 struct si_images_info {

From 2abe4f8d7dcdcff75c28958e1a691ebf6cdee1ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 6 Apr 2016 12:00:08 -0500
Subject: [PATCH 70/72] radeonsi: raise number of samplers per shader to 32
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94835
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index c4d6b9d9eee..bec99e15efa 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -144,10 +144,10 @@ struct si_shader_data {
 	uint32_t		sh_base[SI_NUM_SHADERS];
 };
 
-/* User sampler views:   0..15
- * Polygon stipple tex:  16
+/* User sampler views:   0..31
+ * Polygon stipple tex:  32
  */
-#define SI_NUM_USER_SAMPLERS            16 /* AKA OpenGL textures units per shader */
+#define SI_NUM_USER_SAMPLERS            32 /* AKA OpenGL textures units per shader */
 #define SI_POLY_STIPPLE_SAMPLER         SI_NUM_USER_SAMPLERS
 #define SI_NUM_SAMPLERS                 (SI_POLY_STIPPLE_SAMPLER + 1)
 

From 059308db841886101586aa3ec5ac74b89abf1a20 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 7 Apr 2016 22:38:47 +0200
Subject: [PATCH 71/72] nv50/ir: do not try to attach JOIN ops to ATOM

This might result in an INVALID_OPCODE dmesg error in case a join is
attached to an atomic operation.

Spotted with arb_shader_image_load_store-host-mem-barrier on GK104.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 66e7b2e8243..fea388685fa 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2824,7 +2824,7 @@ FlatteningPass::visit(BasicBlock *bb)
              !isSurfaceOp(insn->op) && // not confirmed
              insn->op != OP_LINTERP && // probably just nve4
              insn->op != OP_PINTERP && // probably just nve4
-             ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
+             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
               (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
              !insn->isNop()) {
             insn->join = 1;

From 1cd19ebc4a892ada69f9085892441c00674b2764 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 3 Apr 2016 03:21:47 +0200
Subject: [PATCH 72/72] radeonsi: do per-pixel clipping based on viewport
 states
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In other words, vport scissors are derived from viewport states.
If the scissor test is enabled, the intersection of both is used.

The guard band will disable clipping, so we have to clip per-pixel.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 95 ++++++++++++++++++++++---
 src/gallium/drivers/radeonsi/si_state.h |  1 +
 2 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 057458c843a..8087d2331ff 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -830,25 +830,93 @@ static void si_set_scissor_states(struct pipe_context *ctx,
 	for (i = 0; i < num_scissors; i++)
 		sctx->scissors.states[start_slot + i] = state[i];
 
+	if (!sctx->queued.named.rasterizer ||
+	    !sctx->queued.named.rasterizer->scissor_enable)
+		return;
+
 	sctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
 	si_mark_atom_dirty(sctx, &sctx->scissors.atom);
 }
 
+static void si_get_scissor_from_viewport(struct pipe_viewport_state *vp,
+					 struct pipe_scissor_state *scissor)
+{
+	/* These must be signed, unlike pipe_scissor_state. */
+	int minx, miny, maxx, maxy, tmp;
+
+	/* Convert (-1, -1) and (1, 1) from clip space into window space. */
+	minx = -vp->scale[0] + vp->translate[0];
+	miny = -vp->scale[1] + vp->translate[1];
+	maxx = vp->scale[0] + vp->translate[0];
+	maxy = vp->scale[1] + vp->translate[1];
+
+	/* r600_draw_rectangle sets this. Disable the scissor. */
+	if (minx == -1 && miny == -1 && maxx == 1 && maxy == 1) {
+		minx = miny = 0;
+		maxx = maxy = 16384;
+	}
+
+	/* Handle inverted viewports. */
+	if (minx > maxx) {
+		tmp = minx;
+		minx = maxx;
+		maxx = tmp;
+	}
+	if (miny > maxy) {
+		tmp = miny;
+		miny = maxy;
+		maxy = tmp;
+	}
+
+	scissor->minx = CLAMP(minx, 0, 16384);
+	scissor->miny = CLAMP(miny, 0, 16384);
+	scissor->maxx = CLAMP(maxx, 0, 16384);
+	scissor->maxy = CLAMP(maxy, 0, 16384);
+}
+
+static void si_clip_scissor(struct pipe_scissor_state *out,
+			    struct pipe_scissor_state *clip)
+{
+	out->minx = MAX2(out->minx, clip->minx);
+	out->miny = MAX2(out->miny, clip->miny);
+	out->maxx = MIN2(out->maxx, clip->maxx);
+	out->maxy = MIN2(out->maxy, clip->maxy);
+}
+
+static void si_emit_one_scissor(struct radeon_winsys_cs *cs,
+				struct pipe_viewport_state *vp,
+				struct pipe_scissor_state *scissor)
+{
+	struct pipe_scissor_state final;
+
+	/* Since the guard band disables clipping, we have to clip per-pixel
+	 * using a scissor.
+	 */
+	si_get_scissor_from_viewport(vp, &final);
+
+	if (scissor)
+		si_clip_scissor(&final, scissor);
+
+	radeon_emit(cs, S_028250_TL_X(final.minx) |
+			S_028250_TL_Y(final.miny) |
+			S_028250_WINDOW_OFFSET_DISABLE(1));
+	radeon_emit(cs, S_028254_BR_X(final.maxx) |
+			S_028254_BR_Y(final.maxy));
+}
+
 static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_scissor_state *states = sctx->scissors.states;
 	unsigned mask = sctx->scissors.dirty_mask;
+	bool scissor_enable = sctx->queued.named.rasterizer->scissor_enable;
 
 	/* The simple case: Only 1 viewport is active. */
 	if (mask & 1 &&
 	    !si_get_vs_info(sctx)->writes_viewport_index) {
 		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
-		radeon_emit(cs, S_028250_TL_X(states[0].minx) |
-				S_028250_TL_Y(states[0].miny) |
-				S_028250_WINDOW_OFFSET_DISABLE(1));
-		radeon_emit(cs, S_028254_BR_X(states[0].maxx) |
-				S_028254_BR_Y(states[0].maxy));
+		si_emit_one_scissor(cs, &sctx->viewports.states[0],
+				    scissor_enable ? &states[0] : NULL);
 		sctx->scissors.dirty_mask &= ~1; /* clear one bit */
 		return;
 	}
@@ -861,11 +929,8 @@ static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom)
 		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL +
 					       start * 4 * 2, count * 2);
 		for (i = start; i < start+count; i++) {
-			radeon_emit(cs, S_028250_TL_X(states[i].minx) |
-					S_028250_TL_Y(states[i].miny) |
-					S_028250_WINDOW_OFFSET_DISABLE(1));
-			radeon_emit(cs, S_028254_BR_X(states[i].maxx) |
-					S_028254_BR_Y(states[i].maxy));
+			si_emit_one_scissor(cs, &sctx->viewports.states[i],
+					    scissor_enable ? &states[i] : NULL);
 		}
 	}
 	sctx->scissors.dirty_mask = 0;
@@ -883,7 +948,9 @@ static void si_set_viewport_states(struct pipe_context *ctx,
 		sctx->viewports.states[start_slot + i] = state[i];
 
 	sctx->viewports.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
+	sctx->scissors.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
 	si_mark_atom_dirty(sctx, &sctx->viewports.atom);
+	si_mark_atom_dirty(sctx, &sctx->scissors.atom);
 }
 
 static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom)
@@ -980,6 +1047,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 		return NULL;
 	}
 
+	rs->scissor_enable = state->scissor;
 	rs->two_side = state->light_twoside;
 	rs->multisample_enable = state->multisample;
 	rs->force_persample_interp = state->force_persample_interp;
@@ -1038,7 +1106,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 		       S_028A48_MSAA_ENABLE(state->multisample ||
 					    state->poly_smooth ||
 					    state->line_smooth) |
-		       S_028A48_VPORT_SCISSOR_ENABLE(state->scissor));
+		       S_028A48_VPORT_SCISSOR_ENABLE(1));
 
 	si_pm4_set_reg(pm4, R_028BE4_PA_SU_VTX_CNTL,
 		       S_028BE4_PIX_CENTER(state->half_pixel_center) |
@@ -1105,6 +1173,11 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 	    (!old_rs || old_rs->multisample_enable != rs->multisample_enable))
 		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
+	if (!old_rs || old_rs->scissor_enable != rs->scissor_enable) {
+		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+		si_mark_atom_dirty(sctx, &sctx->scissors.atom);
+	}
+
 	si_pm4_bind_state(sctx, rasterizer, rs);
 	si_update_poly_offset_state(sctx);
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index bec99e15efa..f55f19e2918 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -68,6 +68,7 @@ struct si_state_rasterizer {
 	bool			uses_poly_offset;
 	bool			clamp_fragment_color;
 	bool			rasterizer_discard;
+	bool			scissor_enable;
 };
 
 struct si_dsa_stencil_ref_part {