From 2eed9e6b756d1e0232ad749cb89e97d535e141bd Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 12 Aug 2015 11:34:54 -0700
Subject: [PATCH 01/85] i965/gen9: Handle the GL_TEXTURE_{1D, 1D_ARRAY} targets
 inside switch

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 2955c8dcc2e..67628c96d20 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -162,9 +162,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
    const unsigned align_3d_ys[] = {32, 32, 32, 16, 16};
    int i = 0;
 
-   assert(brw->gen >= 9 &&
-          mt->target != GL_TEXTURE_1D &&
-          mt->target != GL_TEXTURE_1D_ARRAY);
+   assert(brw->gen >= 9);
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
    assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ;
@@ -184,8 +182,10 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
       align_yf = align_3d_yf;
       align_ys = align_3d_ys;
       break;
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_1D_ARRAY:
    default:
-      unreachable("not reached");
+      unreachable("Unexpected miptree target");
    }
 
    /* Compute array index. */

From 867284a8f07b69887f8adb109fb6c71156668227 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Fri, 28 Aug 2015 08:45:11 -0400
Subject: [PATCH 02/85] st/omx/dec/h264: fix field picture type 0 poc disorder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Cc: "10.6 11.0" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/state_trackers/omx/vid_dec_h264.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/gallium/state_trackers/omx/vid_dec_h264.c b/src/gallium/state_trackers/omx/vid_dec_h264.c
index 18d88039579..f66ed896e62 100644
--- a/src/gallium/state_trackers/omx/vid_dec_h264.c
+++ b/src/gallium/state_trackers/omx/vid_dec_h264.c
@@ -753,10 +753,14 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
          priv->codec_data.h264.delta_pic_order_cnt_bottom = delta_pic_order_cnt_bottom;
       }
 
-      priv->picture.h264.field_order_cnt[0] = pic_order_cnt_msb + pic_order_cnt_lsb;
-      priv->picture.h264.field_order_cnt[1] = pic_order_cnt_msb + pic_order_cnt_lsb;
-      if (!priv->picture.h264.field_pic_flag)
-         priv->picture.h264.field_order_cnt[1] += priv->codec_data.h264.delta_pic_order_cnt_bottom;
+      if (!priv->picture.h264.field_pic_flag) {
+         priv->picture.h264.field_order_cnt[0] = pic_order_cnt_msb + pic_order_cnt_lsb;
+         priv->picture.h264.field_order_cnt[1] = priv->picture.h264.field_order_cnt [0] +
+                                          priv->codec_data.h264.delta_pic_order_cnt_bottom;
+      } else if (!priv->picture.h264.bottom_field_flag)
+         priv->picture.h264.field_order_cnt[0] = pic_order_cnt_msb + pic_order_cnt_lsb;
+      else
+         priv->picture.h264.field_order_cnt[1] = pic_order_cnt_msb + pic_order_cnt_lsb;
 
    } else if (sps->pic_order_cnt_type == 1) {
       unsigned MaxFrameNum = 1 << (sps->log2_max_frame_num_minus4 + 4);

From aa9f06b3ea99b318469c3d140651f4b4986896a6 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sun, 18 Oct 2015 09:12:40 +1100
Subject: [PATCH 03/85] glsl: fix regression when building interface field name
 for SSBOs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes regression caused by bb5aeb854915ba67abc56257f830d002c956439e

We don't care about the swizzle when building the name so just skip over it.

Tested-by: Markus Wick <markus@selfnet.de>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/lower_ubo_reference.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index e818c048461..57a242b4074 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -238,6 +238,8 @@ interface_field_name(void *mem_ctx, char *base_name, ir_rvalue *d,
       case ir_type_swizzle: {
          ir_swizzle *s = (ir_swizzle *) ir;
          ir = s->val->as_dereference();
+         /* Skip swizzle in the next pass */
+         d = ir;
          break;
       }
 

From 2832ca95ecce064c7d841a3a374c2179f56161be Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sat, 17 Oct 2015 20:22:14 +1100
Subject: [PATCH 04/85] glsl: fix stream qualifier for blocks with an instance
 name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This also removes the validation from the parser as it is not required
and once arb_enhanced_layouts comes along we won't be able to do validation
on the stream qualifier in the parser anyway as it adds constant expression
support to the stream qualifier.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Cc: 11.0 <mesa-stable@lists.freedesktop.org>
---
 src/glsl/ast_to_hir.cpp                   | 26 +++++++++++++----------
 src/glsl/glsl_parser.yy                   | 11 ----------
 src/glsl/lower_named_interface_blocks.cpp |  1 +
 3 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index ede02d94cb2..db9229f6ae3 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -6293,6 +6293,18 @@ ast_interface_block::hir(exec_list *instructions,
 
    state->struct_specifier_depth--;
 
+   for (unsigned i = 0; i < num_variables; i++) {
+      if (fields[i].stream != -1 &&
+          (unsigned) fields[i].stream != this->layout.stream) {
+         _mesa_glsl_error(&loc, state,
+                          "stream layout qualifier on "
+                          "interface block member `%s' does not match "
+                          "the interface block (%d vs %d)",
+                          fields[i].name, fields[i].stream,
+                          this->layout.stream);
+      }
+   }
+
    if (!redeclaring_per_vertex) {
       validate_identifier(this->block_name, loc, state);
 
@@ -6633,6 +6645,8 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.explicit_binding = this->layout.flags.q.explicit_binding;
          var->data.binding = this->layout.binding;
 
+         var->data.stream = this->layout.stream;
+
          state->symbols->add_variable(var);
          instructions->push_tail(var);
       }
@@ -6651,6 +6665,7 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.centroid = fields[i].centroid;
          var->data.sample = fields[i].sample;
          var->data.patch = fields[i].patch;
+         var->data.stream = this->layout.stream;
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
@@ -6663,17 +6678,6 @@ ast_interface_block::hir(exec_list *instructions,
             var->data.matrix_layout = fields[i].matrix_layout;
          }
 
-         if (fields[i].stream != -1 &&
-             ((unsigned)fields[i].stream) != this->layout.stream) {
-            _mesa_glsl_error(&loc, state,
-                             "stream layout qualifier on "
-                             "interface block member `%s' does not match "
-                             "the interface block (%d vs %d)",
-                             var->name, fields[i].stream, this->layout.stream);
-         }
-
-         var->data.stream = this->layout.stream;
-
          if (var->data.mode == ir_var_shader_storage) {
             var->data.image_read_only = fields[i].image_read_only;
             var->data.image_write_only = fields[i].image_write_only;
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index cd00f6e085b..2f2e10d7992 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -2609,17 +2609,6 @@ interface_block:
 
       block->layout.is_default_qualifier = false;
 
-      foreach_list_typed (ast_declarator_list, member, link, &block->declarations) {
-         ast_type_qualifier& qualifier = member->type->qualifier;
-         if (qualifier.flags.q.stream && qualifier.stream != block->layout.stream) {
-               _mesa_glsl_error(& @1, state,
-                             "stream layout qualifier on "
-                             "interface block member does not match "
-                             "the interface block (%d vs %d)",
-                             qualifier.stream, block->layout.stream);
-               YYERROR;
-         }
-      }
       $$ = block;
    }
    | memory_qualifier interface_block
diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp
index 276a2dedf47..114bb5811b4 100644
--- a/src/glsl/lower_named_interface_blocks.cpp
+++ b/src/glsl/lower_named_interface_blocks.cpp
@@ -186,6 +186,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
             new_var->data.centroid = iface_t->fields.structure[i].centroid;
             new_var->data.sample = iface_t->fields.structure[i].sample;
             new_var->data.patch = iface_t->fields.structure[i].patch;
+            new_var->data.stream = var->data.stream;
 
             new_var->init_interface_type(iface_t);
             hash_table_insert(interface_namespace, new_var,

From 6bd9e0351205dc475f45b58979702b5cf414aa07 Mon Sep 17 00:00:00 2001
From: Boyan Ding <boyan.j.ding@gmail.com>
Date: Fri, 16 Oct 2015 15:15:39 +0800
Subject: [PATCH 05/85] vc4: Use nir_foreach_variable

Signed-off-by: Boyan Ding <boyan.j.ding@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/drivers/vc4/vc4_nir_lower_blend.c | 2 +-
 src/gallium/drivers/vc4/vc4_nir_lower_io.c    | 4 ++--
 src/gallium/drivers/vc4/vc4_program.c         | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index a842d604a51..17b524653bb 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -393,7 +393,7 @@ vc4_nir_lower_blend_block(nir_block *block, void *state)
                         continue;
 
                 nir_variable *output_var = NULL;
-                foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+                nir_foreach_variable(var, &c->s->outputs) {
                         if (var->data.driver_location == intr->const_index[0]) {
                                 output_var = var;
                                 break;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index a98d70da7d8..761e2c819c5 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -63,7 +63,7 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
         }
 
         nir_variable *input_var = NULL;
-        foreach_list_typed(nir_variable, var, node, &c->s->inputs) {
+        nir_foreach_variable(var, &c->s->inputs) {
                 if (var->data.driver_location == intr->const_index[0]) {
                         input_var = var;
                         break;
@@ -129,7 +129,7 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
                      nir_intrinsic_instr *intr)
 {
         nir_variable *output_var = NULL;
-        foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+        nir_foreach_variable(var, &c->s->outputs) {
                 if (var->data.driver_location == intr->const_index[0]) {
                         output_var = var;
                         break;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 31c7e28ff57..1b590a2d0c4 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1383,13 +1383,13 @@ static void
 ntq_setup_inputs(struct vc4_compile *c)
 {
         unsigned num_entries = 0;
-        foreach_list_typed(nir_variable, var, node, &c->s->inputs)
+        nir_foreach_variable(var, &c->s->inputs)
                 num_entries++;
 
         nir_variable *vars[num_entries];
 
         unsigned i = 0;
-        foreach_list_typed(nir_variable, var, node, &c->s->inputs)
+        nir_foreach_variable(var, &c->s->inputs)
                 vars[i++] = var;
 
         /* Sort the variables so that we emit the input setup in
@@ -1432,7 +1432,7 @@ ntq_setup_inputs(struct vc4_compile *c)
 static void
 ntq_setup_outputs(struct vc4_compile *c)
 {
-        foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+        nir_foreach_variable(var, &c->s->outputs) {
                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                 unsigned loc = var->data.driver_location * 4;
 
@@ -1471,7 +1471,7 @@ ntq_setup_outputs(struct vc4_compile *c)
 static void
 ntq_setup_uniforms(struct vc4_compile *c)
 {
-        foreach_list_typed(nir_variable, var, node, &c->s->uniforms) {
+        nir_foreach_variable(var, &c->s->uniforms) {
                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                 unsigned array_elem_size = 4 * sizeof(float);
 

From 12321966aec5e635c51208f409737dd1ddc3c883 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 17 Mar 2015 14:46:04 +0100
Subject: [PATCH 06/85] radeonsi: add support for ARB_texture_view
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All tests pass. We don't need to do much - just set CUBE if the view
target is CUBE or CUBE_ARRAY, otherwise set the resource target.

The reason this can be so simple is that texture instructions
have a greater effect on the target than the sampler view.

Thanks Glenn for the piglit test.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 docs/GL3.txt                            |  2 +-
 docs/relnotes/11.1.0.html               |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c  |  2 +-
 src/gallium/drivers/radeonsi/si_state.c | 27 +++++++++++++++++++------
 4 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 6503e2ab1da..167321676df 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -169,7 +169,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
   GL_ARB_texture_query_levels                          DONE (all drivers that support GLSL 1.30)
   GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, llvmpipe, softpipe)
+  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_vertex_attrib_binding                         DONE (all drivers)
 
 
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index dcf425e4c68..d3dbe9dda13 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -51,6 +51,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
 <li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
 <li>GL_ARB_texture_query_lod on softpipe</li>
+<li>GL_ARB_texture_view on radeonsi</li>
 <li>EGL_KHR_create_context on softpipe, llvmpipe</li>
 <li>EGL_KHR_gl_colorspace on softpipe, llvmpipe</li>
 </ul>
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 53c80dba602..6be78afe4a9 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -294,6 +294,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
 	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
+	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_TEXTURE_QUERY_LOD:
 	case PIPE_CAP_TEXTURE_GATHER_SM5:
 	case PIPE_CAP_TGSI_TXQS:
@@ -335,7 +336,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
 		return 0;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index e6475364f98..2e77a3605a6 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1535,9 +1535,14 @@ static unsigned si_tex_compare(unsigned compare)
 	}
 }
 
-static unsigned si_tex_dim(unsigned dim, unsigned nr_samples)
+static unsigned si_tex_dim(unsigned res_target, unsigned view_target,
+			   unsigned nr_samples)
 {
-	switch (dim) {
+	if (view_target == PIPE_TEXTURE_CUBE ||
+	    view_target == PIPE_TEXTURE_CUBE_ARRAY)
+		res_target = view_target;
+
+	switch (res_target) {
 	default:
 	case PIPE_TEXTURE_1D:
 		return V_008F1C_SQ_RSRC_IMG_1D;
@@ -2391,6 +2396,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	struct radeon_surf_level *surflevel;
 	int first_non_void;
 	uint64_t va;
+	unsigned last_layer = state->u.tex.last_layer;
 
 	if (view == NULL)
 		return NULL;
@@ -2596,6 +2602,13 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	} else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY)
 		depth = texture->array_size / 6;
 
+	/* This is not needed if state trackers set last_layer correctly. */
+	if (state->target == PIPE_TEXTURE_1D ||
+	    state->target == PIPE_TEXTURE_2D ||
+	    state->target == PIPE_TEXTURE_RECT ||
+	    state->target == PIPE_TEXTURE_CUBE)
+		last_layer = state->u.tex.first_layer;
+
 	va = tmp->resource.gpu_address + surflevel[base_level].offset;
 
 	view->state[0] = va >> 8;
@@ -2615,10 +2628,11 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 						      last_level) |
 			  S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) |
 			  S_008F1C_POW2_PAD(texture->last_level > 0) |
-			  S_008F1C_TYPE(si_tex_dim(texture->target, texture->nr_samples)));
+			  S_008F1C_TYPE(si_tex_dim(texture->target, state->target,
+						   texture->nr_samples)));
 	view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));
 	view->state[5] = (S_008F24_BASE_ARRAY(state->u.tex.first_layer) |
-			  S_008F24_LAST_ARRAY(state->u.tex.last_layer));
+			  S_008F24_LAST_ARRAY(last_layer));
 	view->state[6] = 0;
 	view->state[7] = 0;
 
@@ -2653,11 +2667,12 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 				       S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
 				       S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
 				       S_008F1C_TILING_INDEX(tmp->fmask.tile_mode_index) |
-				       S_008F1C_TYPE(si_tex_dim(texture->target, 0));
+				       S_008F1C_TYPE(si_tex_dim(texture->target,
+								state->target, 0));
 		view->fmask_state[4] = S_008F20_DEPTH(depth - 1) |
 				       S_008F20_PITCH(tmp->fmask.pitch - 1);
 		view->fmask_state[5] = S_008F24_BASE_ARRAY(state->u.tex.first_layer) |
-				       S_008F24_LAST_ARRAY(state->u.tex.last_layer);
+				       S_008F24_LAST_ARRAY(last_layer);
 		view->fmask_state[6] = 0;
 		view->fmask_state[7] = 0;
 	}

From d74e7b6fb9dca5622c17413821d4cfcc67472e76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Sep 2015 21:02:15 +0200
Subject: [PATCH 07/85] gallium: add PIPE_CAP_SHAREABLE_SHADERS

I'll let drivers figure out how to do it.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/docs/source/screen.rst               | 2 ++
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c           | 1 +
 src/gallium/drivers/ilo/ilo_screen.c             | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c         | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 1 +
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 1 +
 src/gallium/drivers/softpipe/sp_screen.c         | 1 +
 src/gallium/drivers/svga/svga_screen.c           | 1 +
 src/gallium/drivers/vc4/vc4_screen.c             | 1 +
 src/gallium/include/pipe/p_defines.h             | 1 +
 15 files changed, 16 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index e08844b2f0b..72f7596886d 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -276,6 +276,8 @@ The integer capabilities:
   GL4 hardware will likely need to emulate it with a shader variant, or by
   selecting the interpolation weights with a conditional assignment
   in the shader.
+* ``PIPE_CAP_SHAREABLE_SHADERS``: Whether shader CSOs can be used by any
+  pipe_context.
 
 
 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b64f78ca32b..f85e4586413 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -237,6 +237,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_TGSI_TXQS:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_SHAREABLE_SHADERS:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 9d6b3d39183..c91408d3d9b 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -249,6 +249,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 76812a666a0..acf688fc02c 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -471,6 +471,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 50c3781f5f8..e2ed267da78 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -298,6 +298,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 335c163b661..d4cf143b9a3 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -171,6 +171,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 812b246ea0e..a4431f20e14 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -216,6 +216,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index f34ad0ed5d1..d34c8a2b07b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -202,6 +202,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 1165ac8a9c0..c1c522b0a3a 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -197,6 +197,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_DEPTH_BOUNDS_TEST:
         case PIPE_CAP_TGSI_TXQS:
         case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+        case PIPE_CAP_SHAREABLE_SHADERS:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 32ce76a9e07..75de553be2b 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -343,6 +343,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_SHAREABLE_SHADERS:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 6be78afe4a9..37e793a2204 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -337,6 +337,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_SHAREABLE_SHADERS:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index d468cf4de54..e7006d2fa0d 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -248,6 +248,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index dab89814334..9bf661fab8c 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -381,6 +381,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
    }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 739ac86193a..3b12464a2f6 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -182,6 +182,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_TGSI_TXQS:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_SHAREABLE_SHADERS:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index a4947154f17..3a1265dcc22 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -632,6 +632,7 @@ enum pipe_cap
    PIPE_CAP_DEPTH_BOUNDS_TEST,
    PIPE_CAP_TGSI_TXQS,
    PIPE_CAP_FORCE_PERSAMPLE_INTERP,
+   PIPE_CAP_SHAREABLE_SHADERS,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

From f4e938e9aee14d42e2175c84ebe7ee32b0bcf8c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Sep 2015 23:36:59 +0200
Subject: [PATCH 08/85] st/mesa: decouple shaders from contexts if they are
 shareable

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/state_tracker/st_atom_shader.c   | 10 +++++-----
 src/mesa/state_tracker/st_cb_bitmap.c     |  2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c |  2 +-
 src/mesa/state_tracker/st_context.c       |  3 ++-
 src/mesa/state_tracker/st_context.h       |  1 +
 src/mesa/state_tracker/st_program.c       | 16 +++++++++++-----
 6 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 1e880a107c0..394145409b3 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -64,7 +64,7 @@ update_fp( struct st_context *st )
    assert(stfp->Base.Base.Target == GL_FRAGMENT_PROGRAM_ARB);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    /* _NEW_FRAG_CLAMP */
    key.clamp_color = st->clamp_frag_color_in_shader &&
@@ -119,7 +119,7 @@ update_vp( struct st_context *st )
    assert(stvp->Base.Base.Target == GL_VERTEX_PROGRAM_ARB);
 
    memset(&key, 0, sizeof key);
-   key.st = st;  /* variants are per-context */
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    /* When this is true, we will add an extra input to the vertex
     * shader translation (for edgeflags), an extra output with
@@ -174,7 +174,7 @@ update_gp( struct st_context *st )
    assert(stgp->Base.Base.Target == GL_GEOMETRY_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    st->gp_variant = st_get_gp_variant(st, stgp, &key);
 
@@ -210,7 +210,7 @@ update_tcp( struct st_context *st )
    assert(sttcp->Base.Base.Target == GL_TESS_CONTROL_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    st->tcp_variant = st_get_tcp_variant(st, sttcp, &key);
 
@@ -246,7 +246,7 @@ update_tep( struct st_context *st )
    assert(sttep->Base.Base.Target == GL_TESS_EVALUATION_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    st->tep_variant = st_get_tep_variant(st, sttep, &key);
 
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index bb6dfe85644..cbc6845d771 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -269,7 +269,7 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    struct pipe_resource *vbuf = NULL;
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
    key.bitmap = GL_TRUE;
    key.clamp_color = st->clamp_frag_color_in_shader &&
                      st->ctx->Color._ClampFragmentColor;
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 7e8633edc1a..20cbfdefd23 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -914,7 +914,7 @@ get_color_fp_variant(struct st_context *st)
 
    memset(&key, 0, sizeof(key));
 
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
    key.drawpixels = 1;
    key.scaleAndBias = (ctx->Pixel.RedBias != 0.0 ||
                        ctx->Pixel.RedScale != 1.0 ||
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index bef7307bb27..6256c0b0d82 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -237,7 +237,8 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
                               PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER);
    st->can_force_persample_interp = screen->get_param(screen,
                                           PIPE_CAP_FORCE_PERSAMPLE_INTERP);
-
+   st->has_shareable_shaders = screen->get_param(screen,
+                                                 PIPE_CAP_SHAREABLE_SHADERS);
    st->needs_texcoord_semantic =
       screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
    st->apply_texture_swizzle_to_border_color =
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index f187d82449b..446fe5de889 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -99,6 +99,7 @@ struct st_context
    boolean has_etc2;
    boolean prefer_blit_based_texture_transfer;
    boolean can_force_persample_interp;
+   boolean has_shareable_shaders;
 
    boolean needs_texcoord_semantic;
    boolean apply_texture_swizzle_to_border_color;
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 6a69ba7aa26..87571a88e78 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -1728,6 +1728,12 @@ destroy_program_variants_cb(GLuint key, void *data, void *userData)
 void
 st_destroy_program_variants(struct st_context *st)
 {
+   /* If shaders can be shared with other contexts, the last context will
+    * call DeleteProgram on all shaders, releasing everything.
+    */
+   if (st->has_shareable_shaders)
+      return;
+
    /* ARB vert/frag program */
    _mesa_HashWalk(st->ctx->Shared->Programs,
                   destroy_program_variants_cb, st);
@@ -1774,7 +1780,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_vp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_vp_variant(st, p, &key);
       break;
    }
@@ -1784,7 +1790,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_tcp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_tcp_variant(st, p, &key);
       break;
    }
@@ -1794,7 +1800,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_tep_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_tep_variant(st, p, &key);
       break;
    }
@@ -1804,7 +1810,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_gp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_gp_variant(st, p, &key);
       break;
    }
@@ -1814,7 +1820,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_fp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_fp_variant(st, p, &key);
       break;
    }

From b99645f8190b267231443829aefad1e73c4c25d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 20 Oct 2015 00:12:53 +0200
Subject: [PATCH 09/85] st/mesa: negate the can_force_persample_interp flag

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/state_tracker/st_atom_rasterizer.c | 2 +-
 src/mesa/state_tracker/st_atom_shader.c     | 2 +-
 src/mesa/state_tracker/st_context.c         | 5 +++--
 src/mesa/state_tracker/st_context.h         | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 0f01e9939de..55d5e66243c 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -239,7 +239,7 @@ static void update_raster_state( struct st_context *st )
 
    /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
    raster->force_persample_interp =
-         st->can_force_persample_interp &&
+         !st->force_persample_in_shader &&
          ctx->Multisample._Enabled &&
          ctx->Multisample.SampleShading &&
          ctx->Multisample.MinSampleShadingValue *
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 394145409b3..0f9ea101889 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -76,7 +76,7 @@ update_fp( struct st_context *st )
     * Ignore sample qualifier while computing this flag.
     */
    key.persample_shading =
-      !st->can_force_persample_interp &&
+      st->force_persample_in_shader &&
       !(stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID |
                                             SYSTEM_BIT_SAMPLE_POS)) &&
       _mesa_get_min_invocations_per_fragment(st->ctx, &stfp->Base, true) > 1;
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 6256c0b0d82..70e006912dc 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -235,8 +235,9 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
                                               PIPE_BIND_SAMPLER_VIEW);
    st->prefer_blit_based_texture_transfer = screen->get_param(screen,
                               PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER);
-   st->can_force_persample_interp = screen->get_param(screen,
-                                          PIPE_CAP_FORCE_PERSAMPLE_INTERP);
+   st->force_persample_in_shader =
+      screen->get_param(screen, PIPE_CAP_SAMPLE_SHADING) &&
+      !screen->get_param(screen, PIPE_CAP_FORCE_PERSAMPLE_INTERP);
    st->has_shareable_shaders = screen->get_param(screen,
                                                  PIPE_CAP_SHAREABLE_SHADERS);
    st->needs_texcoord_semantic =
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 446fe5de889..ec95259b67a 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -98,7 +98,7 @@ struct st_context
    boolean has_etc1;
    boolean has_etc2;
    boolean prefer_blit_based_texture_transfer;
-   boolean can_force_persample_interp;
+   boolean force_persample_in_shader;
    boolean has_shareable_shaders;
 
    boolean needs_texcoord_semantic;

From e57dd7a08bfeacab47d64c3adeb392f8c15ca793 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 28 Sep 2015 00:04:39 +0200
Subject: [PATCH 10/85] st/mesa: create shaders which have only one variant
 immediately (v2)

v2: fix the condition when lacking sample shading

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/state_tracker/st_cb_program.c |  5 +++--
 src/mesa/state_tracker/st_context.c    | 14 ++++++++++++++
 src/mesa/state_tracker/st_context.h    |  7 +++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 708bdf5011e..2c4eccf1e06 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -224,6 +224,7 @@ st_program_string_notify( struct gl_context *ctx,
                                            struct gl_program *prog )
 {
    struct st_context *st = st_context(ctx);
+   gl_shader_stage stage = _mesa_program_enum_to_shader_stage(target);
 
    if (target == GL_FRAGMENT_PROGRAM_ARB) {
       struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
@@ -278,10 +279,10 @@ st_program_string_notify( struct gl_context *ctx,
          st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
    }
 
-   if (ST_DEBUG & DEBUG_PRECOMPILE)
+   if (ST_DEBUG & DEBUG_PRECOMPILE ||
+       st->shader_has_one_variant[stage])
       st_precompile_shader_variant(st, prog);
 
-   /* XXX check if program is legal, within limits */
    return GL_TRUE;
 }
 
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 70e006912dc..5abb17385c2 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -294,6 +294,20 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
          ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler = true;
    }
 
+   /* Set which shader types can be compiled at link time. */
+   st->shader_has_one_variant[MESA_SHADER_VERTEX] =
+         st->has_shareable_shaders &&
+         !st->clamp_vert_color_in_shader;
+
+   st->shader_has_one_variant[MESA_SHADER_FRAGMENT] =
+         st->has_shareable_shaders &&
+         !st->clamp_frag_color_in_shader &&
+         !st->force_persample_in_shader;
+
+   st->shader_has_one_variant[MESA_SHADER_TESS_CTRL] = st->has_shareable_shaders;
+   st->shader_has_one_variant[MESA_SHADER_TESS_EVAL] = st->has_shareable_shaders;
+   st->shader_has_one_variant[MESA_SHADER_GEOMETRY] = st->has_shareable_shaders;
+
    _mesa_compute_version(ctx);
 
    if (ctx->Version == 0) {
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index ec95259b67a..c243f5cd966 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -101,6 +101,13 @@ struct st_context
    boolean force_persample_in_shader;
    boolean has_shareable_shaders;
 
+   /**
+    * If a shader can be created when we get its source.
+    * This means it has only 1 variant, not counting glBitmap and
+    * glDrawPixels.
+    */
+   boolean shader_has_one_variant[MESA_SHADER_STAGES];
+
    boolean needs_texcoord_semantic;
    boolean apply_texture_swizzle_to_border_color;
 

From 9b54ce3362f117b4d46497b578211bb26554dd78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Wed, 7 Oct 2015 01:48:18 +0200
Subject: [PATCH 11/85] radeonsi: support thread-safe shaders shared by
 multiple contexts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "current" shader pointer is moved from the CSO to the context, so that
the CSO is mostly immutable.

The only drawback is that the "current" pointer isn't saved when unbinding
a shader and it must be looked up when the shader is bound again.

This is also a prerequisite for multithreaded shader compilation.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c        |  10 +-
 src/gallium/drivers/radeonsi/si_debug.c       |  18 +-
 src/gallium/drivers/radeonsi/si_descriptors.c |  12 +-
 src/gallium/drivers/radeonsi/si_pipe.c        |   6 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |  21 +-
 src/gallium/drivers/radeonsi/si_shader.h      |  31 +-
 src/gallium/drivers/radeonsi/si_state.c       |   2 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  |  44 +--
 .../drivers/radeonsi/si_state_shaders.c       | 281 +++++++++---------
 9 files changed, 225 insertions(+), 200 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index d5c5db30029..082ea850675 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -55,11 +55,11 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 	util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
 	util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
 	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader);
-	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader);
-	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader);
-	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader);
-	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader);
+	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
+	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
+	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
 	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
 	util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
 	util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index 7d41e8d00e0..53062187b88 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -31,15 +31,15 @@
 #include "ddebug/dd_util.h"
 
 
-static void si_dump_shader(struct si_shader_selector *sel, const char *name,
+static void si_dump_shader(struct si_shader_ctx_state *state, const char *name,
 			   FILE *f)
 {
-	if (!sel || !sel->current)
+	if (!state->cso || !state->current)
 		return;
 
 	fprintf(f, "%s shader disassembly:\n", name);
-	si_dump_shader_key(sel->type, &sel->current->key, f);
-	fprintf(f, "%s\n\n", sel->current->binary.disasm_string);
+	si_dump_shader_key(state->cso->type, &state->current->key, f);
+	fprintf(f, "%s\n\n", state->current->binary.disasm_string);
 }
 
 /* Parsed IBs are difficult to read without colors. Use "less -R file" to
@@ -536,11 +536,11 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
 	if (flags & PIPE_DEBUG_DEVICE_IS_HUNG)
 		si_dump_debug_registers(sctx, f);
 
-	si_dump_shader(sctx->vs_shader, "Vertex", f);
-	si_dump_shader(sctx->tcs_shader, "Tessellation control", f);
-	si_dump_shader(sctx->tes_shader, "Tessellation evaluation", f);
-	si_dump_shader(sctx->gs_shader, "Geometry", f);
-	si_dump_shader(sctx->ps_shader, "Fragment", f);
+	si_dump_shader(&sctx->vs_shader, "Vertex", f);
+	si_dump_shader(&sctx->tcs_shader, "Tessellation control", f);
+	si_dump_shader(&sctx->tes_shader, "Tessellation evaluation", f);
+	si_dump_shader(&sctx->gs_shader, "Geometry", f);
+	si_dump_shader(&sctx->ps_shader, "Fragment", f);
 
 	si_dump_last_bo_list(sctx, f);
 	si_dump_last_ib(sctx, f);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 19dd14f9b6f..13738da5e2c 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -915,10 +915,10 @@ static void si_set_user_data_base(struct si_context *sctx,
 void si_shader_change_notify(struct si_context *sctx)
 {
 	/* VS can be bound as VS, ES, or LS. */
-	if (sctx->tes_shader)
+	if (sctx->tes_shader.cso)
 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
 				      R_00B530_SPI_SHADER_USER_DATA_LS_0);
-	else if (sctx->gs_shader)
+	else if (sctx->gs_shader.cso)
 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
 				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
 	else
@@ -926,8 +926,8 @@ void si_shader_change_notify(struct si_context *sctx)
 				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
 
 	/* TES can be bound as ES, VS, or not bound. */
-	if (sctx->tes_shader) {
-		if (sctx->gs_shader)
+	if (sctx->tes_shader.cso) {
+		if (sctx->gs_shader.cso)
 			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
 					      R_00B330_SPI_SHADER_USER_DATA_ES_0);
 		else
@@ -964,7 +964,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 	unsigned i;
 	uint32_t *sh_base = sctx->shader_userdata.sh_base;
 
-	if (sctx->gs_shader) {
+	if (sctx->gs_shader.cso) {
 		/* The VS copy shader needs these for clipping, streamout, and rings. */
 		unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
 		unsigned i = PIPE_SHADER_VERTEX;
@@ -975,7 +975,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 		/* The TESSEVAL shader needs this for the ESGS ring buffer. */
 		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc,
 				       R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
-	} else if (sctx->tes_shader) {
+	} else if (sctx->tes_shader.cso) {
 		/* The TESSEVAL shader needs this for streamout. */
 		si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
 				       R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 37e793a2204..c084f03cd25 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -57,8 +57,8 @@ static void si_destroy_context(struct pipe_context *context)
 		sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state);
 	if (sctx->dummy_pixel_shader)
 		sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader);
-	if (sctx->fixed_func_tcs_shader)
-		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader);
+	if (sctx->fixed_func_tcs_shader.cso)
+		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader.cso);
 	if (sctx->custom_dsa_flush)
 		sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
 	if (sctx->custom_blend_resolve)
@@ -293,6 +293,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
 	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_TEXTURE_QUERY_LOD:
@@ -337,7 +338,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
-	case PIPE_CAP_SHAREABLE_SHADERS:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 2abd5b5a0c3..d7a2282952a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -152,6 +152,15 @@ struct si_viewports {
 	struct pipe_viewport_state	states[SI_MAX_VIEWPORTS];
 };
 
+/* A shader state consists of the shader selector, which is a constant state
+ * object shared by multiple contexts and shouldn't be modified, and
+ * the current shader variant selected for this context.
+ */
+struct si_shader_ctx_state {
+	struct si_shader_selector	*cso;
+	struct si_shader		*current;
+};
+
 struct si_context {
 	struct r600_common_context	b;
 	struct blitter_context		*blitter;
@@ -162,7 +171,7 @@ struct si_context {
 	void				*pstipple_sampler_state;
 	struct si_screen		*screen;
 	struct pipe_fence_handle	*last_gfx_fence;
-	struct si_shader_selector	*fixed_func_tcs_shader;
+	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	LLVMTargetMachineRef		tm;
 
 	/* Atoms (direct states). */
@@ -199,11 +208,11 @@ struct si_context {
 	void				*dummy_pixel_shader;
 
 	/* shaders */
-	struct si_shader_selector	*ps_shader;
-	struct si_shader_selector	*gs_shader;
-	struct si_shader_selector	*vs_shader;
-	struct si_shader_selector	*tcs_shader;
-	struct si_shader_selector	*tes_shader;
+	struct si_shader_ctx_state	ps_shader;
+	struct si_shader_ctx_state	gs_shader;
+	struct si_shader_ctx_state	vs_shader;
+	struct si_shader_ctx_state	tcs_shader;
+	struct si_shader_ctx_state	tes_shader;
 	struct si_cs_shader_state	cs_shader_state;
 
 	/* shader information */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 54dad726d01..b1076ed9183 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -179,15 +179,18 @@ struct radeon_shader_reloc;
 
 struct si_shader;
 
+/* A shader selector is a gallium CSO and contains shader variants and
+ * binaries for one TGSI program. This can be shared by multiple contexts.
+ */
 struct si_shader_selector {
-	struct si_shader *current;
+	pipe_mutex		mutex;
+	struct si_shader	*first_variant; /* immutable after the first variant */
+	struct si_shader	*last_variant; /* mutable */
 
 	struct tgsi_token       *tokens;
 	struct pipe_stream_output_info  so;
 	struct tgsi_shader_info		info;
 
-	unsigned	num_shaders;
-
 	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
 	unsigned	type;
 
@@ -293,24 +296,24 @@ struct si_shader {
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
 {
-	if (sctx->gs_shader)
-		return &sctx->gs_shader->info;
-	else if (sctx->tes_shader)
-		return &sctx->tes_shader->info;
-	else if (sctx->vs_shader)
-		return &sctx->vs_shader->info;
+	if (sctx->gs_shader.cso)
+		return &sctx->gs_shader.cso->info;
+	else if (sctx->tes_shader.cso)
+		return &sctx->tes_shader.cso->info;
+	else if (sctx->vs_shader.cso)
+		return &sctx->vs_shader.cso->info;
 	else
 		return NULL;
 }
 
 static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 {
-	if (sctx->gs_shader)
-		return sctx->gs_shader->current->gs_copy_shader;
-	else if (sctx->tes_shader)
-		return sctx->tes_shader->current;
+	if (sctx->gs_shader.current)
+		return sctx->gs_shader.current->gs_copy_shader;
+	else if (sctx->tes_shader.current)
+		return sctx->tes_shader.current;
 	else
-		return sctx->vs_shader->current;
+		return sctx->vs_shader.current;
 }
 
 static inline bool si_vs_exports_prim_id(struct si_shader *shader)
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 2e77a3605a6..243bdc6e6d7 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -266,7 +266,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
 	if (blend->dual_src_blend &&
-	    (sctx->ps_shader->ps_colors_written & 0x3) != 0x3)
+	    (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3)
 		mask = 0;
 
 	radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, mask);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 5face423941..ce6c98c3124 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -109,11 +109,11 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 				       unsigned *num_patches)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	struct si_shader_selector *ls = sctx->vs_shader;
+	struct si_shader_ctx_state *ls = &sctx->vs_shader;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
 	struct si_shader_selector *tcs =
-		sctx->tcs_shader ? sctx->tcs_shader : sctx->tes_shader;
+		sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
 	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
 	unsigned num_tcs_input_cp = info->vertices_per_patch;
 	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
@@ -138,9 +138,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 
 	/* This calculates how shader inputs and outputs among VS, TCS, and TES
 	 * are laid out in LDS. */
-	num_tcs_inputs = util_last_bit64(ls->outputs_written);
+	num_tcs_inputs = util_last_bit64(ls->cso->outputs_written);
 
-	if (sctx->tcs_shader) {
+	if (sctx->tcs_shader.cso) {
 		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
 		num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 		num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
@@ -159,7 +159,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
 	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 
-	output_patch0_offset = sctx->tcs_shader ? input_patch_size * *num_patches : 0;
+	output_patch0_offset = sctx->tcs_shader.cso ? input_patch_size * *num_patches : 0;
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
@@ -231,13 +231,13 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	bool partial_vs_wave = false;
 	bool partial_es_wave = false;
 
-	if (sctx->gs_shader)
+	if (sctx->gs_shader.cso)
 		primgroup_size = 64; /* recommended with a GS */
 
-	if (sctx->tes_shader) {
+	if (sctx->tes_shader.cso) {
 		unsigned num_cp_out =
-			sctx->tcs_shader ?
-			sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+			sctx->tcs_shader.cso ?
+			sctx->tcs_shader.cso->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
 			info->vertices_per_patch;
 		unsigned max_size = 256 / MAX2(info->vertices_per_patch, num_cp_out);
 
@@ -248,8 +248,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 
 		/* SWITCH_ON_EOI must be set if PrimID is used.
 		 * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
-		if ((sctx->tcs_shader && sctx->tcs_shader->info.uses_primid) ||
-		    sctx->tes_shader->info.uses_primid) {
+		if ((sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) ||
+		    sctx->tes_shader.cso->info.uses_primid) {
 			ia_switch_on_eoi = true;
 			partial_es_wave = true;
 		}
@@ -258,7 +258,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		if ((sctx->b.family == CHIP_TAHITI ||
 		     sctx->b.family == CHIP_PITCAIRN ||
 		     sctx->b.family == CHIP_BONAIRE) &&
-		    sctx->gs_shader)
+		    sctx->gs_shader.cso)
 			partial_vs_wave = true;
 	}
 
@@ -328,11 +328,11 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
 {
 	unsigned num_output_cp;
 
-	if (!sctx->tes_shader)
+	if (!sctx->tes_shader.cso)
 		return 0;
 
-	num_output_cp = sctx->tcs_shader ?
-		sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+	num_output_cp = sctx->tcs_shader.cso ?
+		sctx->tcs_shader.cso->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
 		info->vertices_per_patch;
 
 	return S_028B58_NUM_PATCHES(num_patches) |
@@ -395,7 +395,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
 	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
 
-	if (sctx->tes_shader)
+	if (sctx->tes_shader.cso)
 		si_emit_derived_tess_state(sctx, info, &num_patches);
 
 	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
@@ -735,11 +735,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	    (info->indexed || !info->count_from_stream_output))
 		return;
 
-	if (!sctx->ps_shader || !sctx->vs_shader) {
+	if (!sctx->ps_shader.cso || !sctx->vs_shader.cso) {
 		assert(0);
 		return;
 	}
-	if (!!sctx->tes_shader != (info->mode == PIPE_PRIM_PATCHES)) {
+	if (!!sctx->tes_shader.cso != (info->mode == PIPE_PRIM_PATCHES)) {
 		assert(0);
 		return;
 	}
@@ -751,11 +751,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	 * This must be done after si_decompress_textures, which can call
 	 * draw_vbo recursively, and before si_update_shaders, which uses
 	 * current_rast_prim for this draw_vbo call. */
-	if (sctx->gs_shader)
-		sctx->current_rast_prim = sctx->gs_shader->gs_output_prim;
-	else if (sctx->tes_shader)
+	if (sctx->gs_shader.cso)
+		sctx->current_rast_prim = sctx->gs_shader.cso->gs_output_prim;
+	else if (sctx->tes_shader.cso)
 		sctx->current_rast_prim =
-			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 	else
 		sctx->current_rast_prim = info->mode;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index c98509bb0b9..8b26b943e00 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -523,26 +523,26 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				key->vs.instance_divisors[i] =
 					sctx->vertex_elements->elements[i].instance_divisor;
 
-		if (sctx->tes_shader)
+		if (sctx->tes_shader.cso)
 			key->vs.as_ls = 1;
-		else if (sctx->gs_shader) {
+		else if (sctx->gs_shader.cso) {
 			key->vs.as_es = 1;
-			key->vs.es_enabled_outputs = sctx->gs_shader->inputs_read;
+			key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
 		}
 
-		if (!sctx->gs_shader && sctx->ps_shader &&
-		    sctx->ps_shader->info.uses_primid)
+		if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
+		    sctx->ps_shader.cso->info.uses_primid)
 			key->vs.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		key->tcs.prim_mode =
-			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		break;
 	case PIPE_SHADER_TESS_EVAL:
-		if (sctx->gs_shader) {
+		if (sctx->gs_shader.cso) {
 			key->tes.as_es = 1;
-			key->tes.es_enabled_outputs = sctx->gs_shader->inputs_read;
-		} else if (sctx->ps_shader && sctx->ps_shader->info.uses_primid)
+			key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
+		} else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
@@ -589,11 +589,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 /* Select the hw shader variant depending on the current state. */
 static int si_shader_select(struct pipe_context *ctx,
-			    struct si_shader_selector *sel)
+			    struct si_shader_ctx_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = state->cso;
+	struct si_shader *current = state->current;
 	union si_shader_key key;
-	struct si_shader * shader = NULL;
+	struct si_shader *iter, *shader = NULL;
 	int r;
 
 	si_shader_selector_key(ctx, sel, &key);
@@ -602,49 +604,51 @@ static int si_shader_select(struct pipe_context *ctx,
 	 * This path is also used for most shaders that don't need multiple
 	 * variants, it will cost just a computation of the key and this
 	 * test. */
-	if (likely(sel->current && memcmp(&sel->current->key, &key, sizeof(key)) == 0)) {
+	if (likely(current && memcmp(&current->key, &key, sizeof(key)) == 0))
 		return 0;
-	}
 
-	/* lookup if we have other variants in the list */
-	if (sel->num_shaders > 1) {
-		struct si_shader *p = sel->current, *c = p->next_variant;
+	pipe_mutex_lock(sel->mutex);
 
-		while (c && memcmp(&c->key, &key, sizeof(key)) != 0) {
-			p = c;
-			c = c->next_variant;
-		}
-
-		if (c) {
-			p->next_variant = c->next_variant;
-			shader = c;
+	/* Find the shader variant. */
+	for (iter = sel->first_variant; iter; iter = iter->next_variant) {
+		/* Don't check the "current" shader. We checked it above. */
+		if (current != iter &&
+		    memcmp(&iter->key, &key, sizeof(key)) == 0) {
+			state->current = iter;
+			pipe_mutex_unlock(sel->mutex);
+			return 0;
 		}
 	}
 
-	if (shader) {
-		shader->next_variant = sel->current;
-		sel->current = shader;
+	/* Build a new shader. */
+	shader = CALLOC_STRUCT(si_shader);
+	if (!shader) {
+		pipe_mutex_unlock(sel->mutex);
+		return -ENOMEM;
+	}
+	shader->selector = sel;
+	shader->key = key;
+
+	r = si_shader_create(sctx->screen, sctx->tm, shader);
+	if (unlikely(r)) {
+		R600_ERR("Failed to build shader variant (type=%u) %d\n",
+			 sel->type, r);
+		FREE(shader);
+		pipe_mutex_unlock(sel->mutex);
+		return r;
+	}
+	si_shader_init_pm4_state(shader);
+
+	if (!sel->last_variant) {
+		sel->first_variant = shader;
+		sel->last_variant = shader;
 	} else {
-		shader = CALLOC(1, sizeof(struct si_shader));
-		shader->selector = sel;
-		shader->key = key;
-
-		shader->next_variant = sel->current;
-		sel->current = shader;
-		r = si_shader_create((struct si_screen*)ctx->screen, sctx->tm,
-				     shader);
-		if (unlikely(r)) {
-			R600_ERR("Failed to build shader variant (type=%u) %d\n",
-				 sel->type, r);
-			sel->current = NULL;
-			FREE(shader);
-			return r;
-		}
-		si_shader_init_pm4_state(shader);
-		sel->num_shaders++;
-		p_atomic_inc(&sctx->screen->b.num_compilations);
+		sel->last_variant->next_variant = shader;
+		sel->last_variant = shader;
 	}
-
+	state->current = shader;
+	p_atomic_inc(&sctx->screen->b.num_compilations);
+	pipe_mutex_unlock(sel->mutex);
 	return 0;
 }
 
@@ -752,14 +756,18 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		break;
 	}
 
-	if (sscreen->b.debug_flags & DBG_PRECOMPILE)
-		if (si_shader_select(ctx, sel)) {
+	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
+		struct si_shader_ctx_state state = {sel};
+
+		if (si_shader_select(ctx, &state)) {
 			fprintf(stderr, "radeonsi: can't create a shader\n");
 			tgsi_free_tokens(sel->tokens);
 			FREE(sel);
 			return NULL;
 		}
+	}
 
+	pipe_mutex_init(sel->mutex);
 	return sel;
 }
 
@@ -787,10 +795,11 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
 
-	if (sctx->vs_shader == sel || !sel)
+	if (sctx->vs_shader.cso == sel || !sel)
 		return;
 
-	sctx->vs_shader = sel;
+	sctx->vs_shader.cso = sel;
+	sctx->vs_shader.current = sel->first_variant;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	si_update_viewports_and_scissors(sctx);
 }
@@ -799,12 +808,13 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
-	bool enable_changed = !!sctx->gs_shader != !!sel;
+	bool enable_changed = !!sctx->gs_shader.cso != !!sel;
 
-	if (sctx->gs_shader == sel)
+	if (sctx->gs_shader.cso == sel)
 		return;
 
-	sctx->gs_shader = sel;
+	sctx->gs_shader.cso = sel;
+	sctx->gs_shader.current = sel ? sel->first_variant : NULL;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
@@ -817,12 +827,13 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
-	bool enable_changed = !!sctx->tcs_shader != !!sel;
+	bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
 
-	if (sctx->tcs_shader == sel)
+	if (sctx->tcs_shader.cso == sel)
 		return;
 
-	sctx->tcs_shader = sel;
+	sctx->tcs_shader.cso = sel;
+	sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
 
 	if (enable_changed)
 		sctx->last_tcs = NULL; /* invalidate derived tess state */
@@ -832,12 +843,13 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
-	bool enable_changed = !!sctx->tes_shader != !!sel;
+	bool enable_changed = !!sctx->tes_shader.cso != !!sel;
 
-	if (sctx->tes_shader == sel)
+	if (sctx->tes_shader.cso == sel)
 		return;
 
-	sctx->tes_shader = sel;
+	sctx->tes_shader.cso = sel;
+	sctx->tes_shader.current = sel ? sel->first_variant : NULL;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
@@ -864,7 +876,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 	struct si_shader_selector *sel = state;
 
 	/* skip if supplied shader is one already in use */
-	if (sctx->ps_shader == sel)
+	if (sctx->ps_shader.cso == sel)
 		return;
 
 	/* use a dummy shader if binding a NULL shader */
@@ -873,7 +885,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 		sel = sctx->dummy_pixel_shader;
 	}
 
-	sctx->ps_shader = sel;
+	sctx->ps_shader.cso = sel;
+	sctx->ps_shader.current = sel->first_variant;
 	si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
 }
 
@@ -881,8 +894,8 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = (struct si_shader_selector *)state;
-	struct si_shader *p = sel->current, *c;
-	struct si_shader_selector **current_shader[SI_NUM_SHADERS] = {
+	struct si_shader *p = sel->first_variant, *c;
+	struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
 		[PIPE_SHADER_VERTEX] = &sctx->vs_shader,
 		[PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
 		[PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader,
@@ -890,8 +903,10 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 		[PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
 	};
 
-	if (*current_shader[sel->type] == sel)
-		*current_shader[sel->type] = NULL;
+	if (current_shader[sel->type]->cso == sel) {
+		current_shader[sel->type]->cso = NULL;
+		current_shader[sel->type]->current = NULL;
+	}
 
 	while (p) {
 		c = p->next_variant;
@@ -927,6 +942,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 		p = c;
 	}
 
+	pipe_mutex_destroy(sel->mutex);
 	free(sel->tokens);
 	free(sel);
 }
@@ -934,7 +950,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	struct si_shader *ps = sctx->ps_shader->current;
+	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct tgsi_shader_info *psinfo = &ps->selector->info;
 	struct tgsi_shader_info *vsinfo = &vs->selector->info;
@@ -1004,7 +1020,7 @@ bcolor:
 static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	struct si_shader *ps = sctx->ps_shader->current;
+	struct si_shader *ps = sctx->ps_shader.current;
 	unsigned input_ena = ps->spi_ps_input_ena;
 
 	/* we need to enable at least one of them, otherwise we hang the GPU */
@@ -1133,7 +1149,7 @@ static void si_init_gs_rings(struct si_context *sctx)
 
 static void si_update_gs_rings(struct si_context *sctx)
 {
-	unsigned gsvs_itemsize = sctx->gs_shader->gsvs_itemsize;
+	unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize;
 	uint64_t offset;
 
 	if (gsvs_itemsize == sctx->last_gsvs_itemsize)
@@ -1167,17 +1183,14 @@ static void si_update_gs_rings(struct si_context *sctx)
  *          < 0 if there was a failure
  */
 static int si_update_scratch_buffer(struct si_context *sctx,
-				    struct si_shader_selector *sel)
+				    struct si_shader *shader)
 {
-	struct si_shader *shader;
 	uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
 	int r;
 
-	if (!sel)
+	if (!shader)
 		return 0;
 
-	shader = sel->current;
-
 	/* This shader doesn't need a scratch buffer */
 	if (shader->scratch_bytes_per_wave == 0)
 		return 0;
@@ -1209,20 +1222,20 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
 	return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0;
 }
 
-static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader_selector *sel)
+static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
 {
-	return sel ? sel->current->scratch_bytes_per_wave : 0;
+	return shader ? shader->scratch_bytes_per_wave : 0;
 }
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
 {
 	unsigned bytes = 0;
 
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
 	return bytes;
 }
 
@@ -1256,46 +1269,46 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 		 * last used, so we still need to try to update them, even if
 		 * they require scratch buffers smaller than the current size.
 		 */
-		r = si_update_scratch_buffer(sctx, sctx->ps_shader);
+		r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1)
-			si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
+			si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
 
-		r = si_update_scratch_buffer(sctx, sctx->gs_shader);
+		r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1)
-			si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
+			si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
 
-		r = si_update_scratch_buffer(sctx, sctx->tcs_shader);
+		r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1)
-			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
 
 		/* VS can be bound as LS, ES, or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->vs_shader);
+		r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1) {
-			if (sctx->tes_shader)
-				si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
-			else if (sctx->gs_shader)
-				si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+			if (sctx->tes_shader.current)
+				si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+			else if (sctx->gs_shader.current)
+				si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
 			else
-				si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+				si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
 		}
 
 		/* TES can be bound as ES or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->tes_shader);
+		r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1) {
-			if (sctx->gs_shader)
-				si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+			if (sctx->gs_shader.current)
+				si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
 			else
-				si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
+				si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
 		}
 	}
 
@@ -1361,7 +1374,7 @@ static void si_generate_fixed_func_tcs(struct si_context *sctx)
 	if (!ureg)
 		return; /* if we get here, we're screwed */
 
-	assert(!sctx->fixed_func_tcs_shader);
+	assert(!sctx->fixed_func_tcs_shader.cso);
 
 	ureg_DECL_constant2D(ureg, 0, 1, SI_DRIVER_STATE_CONST_BUF);
 	const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0),
@@ -1376,7 +1389,7 @@ static void si_generate_fixed_func_tcs(struct si_context *sctx)
 	ureg_MOV(ureg, tessinner, const1);
 	ureg_END(ureg);
 
-	sctx->fixed_func_tcs_shader =
+	sctx->fixed_func_tcs_shader.cso =
 		ureg_create_shader_and_destroy(ureg, &sctx->b.b);
 }
 
@@ -1384,7 +1397,7 @@ static void si_update_vgt_shader_config(struct si_context *sctx)
 {
 	/* Calculate the index of the config.
 	 * 0 = VS, 1 = VS+GS, 2 = VS+Tess, 3 = VS+Tess+GS */
-	unsigned index = 2*!!sctx->tes_shader + !!sctx->gs_shader;
+	unsigned index = 2*!!sctx->tes_shader.cso + !!sctx->gs_shader.cso;
 	struct si_pm4_state **pm4 = &sctx->vgt_shader_config[index];
 
 	if (!*pm4) {
@@ -1392,17 +1405,17 @@ static void si_update_vgt_shader_config(struct si_context *sctx)
 
 		*pm4 = CALLOC_STRUCT(si_pm4_state);
 
-		if (sctx->tes_shader) {
+		if (sctx->tes_shader.cso) {
 			stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
 				  S_028B54_HS_EN(1);
 
-			if (sctx->gs_shader)
+			if (sctx->gs_shader.cso)
 				stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
 					  S_028B54_GS_EN(1) |
 				          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
 			else
 				stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
-		} else if (sctx->gs_shader) {
+		} else if (sctx->gs_shader.cso) {
 			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
 				  S_028B54_GS_EN(1) |
 			          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
@@ -1432,7 +1445,7 @@ bool si_update_shaders(struct si_context *sctx)
 	int r;
 
 	/* Update stages before GS. */
-	if (sctx->tes_shader) {
+	if (sctx->tes_shader.cso) {
 		if (!sctx->tf_ring) {
 			si_init_tess_factor_ring(sctx);
 			if (!sctx->tf_ring)
@@ -1440,65 +1453,65 @@ bool si_update_shaders(struct si_context *sctx)
 		}
 
 		/* VS as LS */
-		r = si_shader_select(ctx, sctx->vs_shader);
+		r = si_shader_select(ctx, &sctx->vs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
+		si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
 
-		if (sctx->tcs_shader) {
-			r = si_shader_select(ctx, sctx->tcs_shader);
+		if (sctx->tcs_shader.cso) {
+			r = si_shader_select(ctx, &sctx->tcs_shader);
 			if (r)
 				return false;
-			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
 		} else {
-			if (!sctx->fixed_func_tcs_shader) {
+			if (!sctx->fixed_func_tcs_shader.cso) {
 				si_generate_fixed_func_tcs(sctx);
-				if (!sctx->fixed_func_tcs_shader)
+				if (!sctx->fixed_func_tcs_shader.cso)
 					return false;
 			}
 
-			r = si_shader_select(ctx, sctx->fixed_func_tcs_shader);
+			r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader);
 			if (r)
 				return false;
 			si_pm4_bind_state(sctx, hs,
-					  sctx->fixed_func_tcs_shader->current->pm4);
+					  sctx->fixed_func_tcs_shader.current->pm4);
 		}
 
-		r = si_shader_select(ctx, sctx->tes_shader);
+		r = si_shader_select(ctx, &sctx->tes_shader);
 		if (r)
 			return false;
 
-		if (sctx->gs_shader) {
+		if (sctx->gs_shader.cso) {
 			/* TES as ES */
-			si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+			si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
 		} else {
 			/* TES as VS */
-			si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
-			si_update_so(sctx, sctx->tes_shader);
+			si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+			si_update_so(sctx, sctx->tes_shader.cso);
 		}
-	} else if (sctx->gs_shader) {
+	} else if (sctx->gs_shader.cso) {
 		/* VS as ES */
-		r = si_shader_select(ctx, sctx->vs_shader);
+		r = si_shader_select(ctx, &sctx->vs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+		si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
 	} else {
 		/* VS as VS */
-		r = si_shader_select(ctx, sctx->vs_shader);
+		r = si_shader_select(ctx, &sctx->vs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
-		si_update_so(sctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+		si_update_so(sctx, sctx->vs_shader.cso);
 	}
 
 	/* Update GS. */
-	if (sctx->gs_shader) {
-		r = si_shader_select(ctx, sctx->gs_shader);
+	if (sctx->gs_shader.cso) {
+		r = si_shader_select(ctx, &sctx->gs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
-		si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4);
-		si_update_so(sctx, sctx->gs_shader);
+		si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+		si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
+		si_update_so(sctx, sctx->gs_shader.cso);
 
 		if (!sctx->gsvs_ring) {
 			si_init_gs_rings(sctx);
@@ -1514,10 +1527,10 @@ bool si_update_shaders(struct si_context *sctx)
 
 	si_update_vgt_shader_config(sctx);
 
-	r = si_shader_select(ctx, sctx->ps_shader);
+	r = si_shader_select(ctx, &sctx->ps_shader);
 	if (r)
 		return false;
-	si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
+	si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
 
 	if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
 	    sctx->sprite_coord_enable != rs->sprite_coord_enable ||
@@ -1543,13 +1556,13 @@ bool si_update_shaders(struct si_context *sctx)
 			return false;
 	}
 
-	if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) {
-		sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control;
+	if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
+		sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
 		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 
-	if (sctx->smoothing_enabled != sctx->ps_shader->current->key.ps.poly_line_smoothing) {
-		sctx->smoothing_enabled = sctx->ps_shader->current->key.ps.poly_line_smoothing;
+	if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) {
+		sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing;
 		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
 		if (sctx->b.chip_class == SI)

From 38391835b5cbdd52e7a3221ff98f402aefa1639b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 17 Oct 2015 23:59:52 +0200
Subject: [PATCH 12/85] radeonsi: fix the export_prim_id field size in the
 shader key
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b1076ed9183..fd5500c1ab3 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -244,7 +244,7 @@ union si_shader_key {
 		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
 		unsigned	as_ls:1; /* local shader */
-		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
+		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} vs;
 	struct {
 		unsigned	prim_mode:3;
@@ -255,7 +255,7 @@ union si_shader_key {
 		 * This describes how outputs are laid out in memory. */
 		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
-		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
+		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} tes; /* tessellation evaluation shader */
 };
 

From 8339585b1206232c1df165108ef6adadb0829ab0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 18 Oct 2015 15:09:24 +0200
Subject: [PATCH 13/85] radeonsi: enable BC_OPTIMIZE if centroid isn't used
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This solution was recommended by a Catalyst developer.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_shaders.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 8b26b943e00..eea00e0fafc 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -404,6 +404,7 @@ static void si_shader_ps(struct si_shader *shader)
 	unsigned num_sgprs, num_user_sgprs;
 	unsigned spi_baryc_cntl = 0;
 	uint64_t va;
+	bool has_centroid;
 
 	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 
@@ -435,8 +436,11 @@ static void si_shader_ps(struct si_shader *shader)
 		}
 	}
 
+	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
+		       G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
+
 	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
-		S_0286D8_BC_OPTIMIZE_DISABLE(1);
+			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 
 	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
 	si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);

From 67f489ded3a4c575e203dc82368ebe645e72079a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 17 Oct 2015 22:50:11 +0200
Subject: [PATCH 14/85] mesa: replace UsesClipDistance with
 ClipDistanceArraySize

This is more practical and needed by gallium.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/glsl/linker.cpp                | 39 ++++++++++++++----------------
 src/glsl/nir/glsl_to_nir.cpp       |  3 ++-
 src/mesa/drivers/dri/i965/brw_vs.c |  2 +-
 src/mesa/main/mtypes.h             |  5 +---
 src/mesa/main/shaderapi.c          |  6 ++---
 5 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 25ca928aa43..247052bcf4f 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -651,7 +651,7 @@ link_invalidate_variable_locations(exec_list *ir)
 
 
 /**
- * Set UsesClipDistance and ClipDistanceArraySize based on the given shader.
+ * Set clip_distance_array_size based on the given shader.
  *
  * Also check for errors based on incorrect usage of gl_ClipVertex and
  * gl_ClipDistance.
@@ -660,10 +660,10 @@ link_invalidate_variable_locations(exec_list *ir)
  */
 static void
 analyze_clip_usage(struct gl_shader_program *prog,
-                   struct gl_shader *shader, GLboolean *UsesClipDistance,
-                   GLuint *ClipDistanceArraySize)
+                   struct gl_shader *shader,
+                   GLuint *clip_distance_array_size)
 {
-   *ClipDistanceArraySize = 0;
+   *clip_distance_array_size = 0;
 
    if (!prog->IsES && prog->Version >= 130) {
       /* From section 7.1 (Vertex Shader Special Variables) of the
@@ -686,13 +686,14 @@ analyze_clip_usage(struct gl_shader_program *prog,
                       _mesa_shader_stage_to_string(shader->Stage));
          return;
       }
-      *UsesClipDistance = clip_distance.variable_found();
-      ir_variable *clip_distance_var =
-         shader->symbols->get_variable("gl_ClipDistance");
-      if (clip_distance_var)
-         *ClipDistanceArraySize = clip_distance_var->type->length;
-   } else {
-      *UsesClipDistance = false;
+
+      if (clip_distance.variable_found()) {
+         ir_variable *clip_distance_var =
+               shader->symbols->get_variable("gl_ClipDistance");
+
+         assert(clip_distance_var);
+         *clip_distance_array_size = clip_distance_var->type->length;
+      }
    }
 }
 
@@ -700,8 +701,7 @@ analyze_clip_usage(struct gl_shader_program *prog,
 /**
  * Verify that a vertex shader executable meets all semantic requirements.
  *
- * Also sets prog->Vert.UsesClipDistance and prog->Vert.ClipDistanceArraySize
- * as a side effect.
+ * Also sets prog->Vert.ClipDistanceArraySize as a side effect.
  *
  * \param shader  Vertex shader executable to be verified
  */
@@ -754,8 +754,7 @@ validate_vertex_shader_executable(struct gl_shader_program *prog,
       }
    }
 
-   analyze_clip_usage(prog, shader, &prog->Vert.UsesClipDistance,
-                      &prog->Vert.ClipDistanceArraySize);
+   analyze_clip_usage(prog, shader, &prog->Vert.ClipDistanceArraySize);
 }
 
 void
@@ -765,8 +764,7 @@ validate_tess_eval_shader_executable(struct gl_shader_program *prog,
    if (shader == NULL)
       return;
 
-   analyze_clip_usage(prog, shader, &prog->TessEval.UsesClipDistance,
-                      &prog->TessEval.ClipDistanceArraySize);
+   analyze_clip_usage(prog, shader, &prog->TessEval.ClipDistanceArraySize);
 }
 
 
@@ -797,8 +795,8 @@ validate_fragment_shader_executable(struct gl_shader_program *prog,
 /**
  * Verify that a geometry shader executable meets all semantic requirements
  *
- * Also sets prog->Geom.VerticesIn, prog->Geom.UsesClipDistance, and
- * prog->Geom.ClipDistanceArraySize as a side effect.
+ * Also sets prog->Geom.VerticesIn and prog->Geom.ClipDistanceArraySize as
+ * a side effect.
  *
  * \param shader Geometry shader executable to be verified
  */
@@ -812,8 +810,7 @@ validate_geometry_shader_executable(struct gl_shader_program *prog,
    unsigned num_vertices = vertices_per_prim(prog->Geom.InputType);
    prog->Geom.VerticesIn = num_vertices;
 
-   analyze_clip_usage(prog, shader, &prog->Geom.UsesClipDistance,
-                      &prog->Geom.ClipDistanceArraySize);
+   analyze_clip_usage(prog, shader, &prog->Geom.ClipDistanceArraySize);
 }
 
 /**
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 76e1382c362..c9cdf35d6db 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -162,7 +162,8 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
    shader->info.outputs_written = sh->Program->OutputsWritten;
    shader->info.system_values_read = sh->Program->SystemValuesRead;
    shader->info.uses_texture_gather = sh->Program->UsesGather;
-   shader->info.uses_clip_distance_out = sh->Program->UsesClipDistanceOut;
+   shader->info.uses_clip_distance_out =
+      sh->Program->ClipDistanceArraySize != 0;
    shader->info.separate_shader = shader_prog->SeparateShader;
    shader->info.has_transform_feedback_varyings =
       shader_prog->TransformFeedback.NumVarying > 0;
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index ba680a98f7e..5db4b3a86af 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -312,7 +312,7 @@ brw_vs_populate_key(struct brw_context *brw,
 
    if (ctx->Transform.ClipPlanesEnabled != 0 &&
        ctx->API == API_OPENGL_COMPAT &&
-       !vp->program.Base.UsesClipDistanceOut) {
+       vp->program.Base.ClipDistanceArraySize == 0) {
       key->nr_userclip_plane_consts =
          _mesa_logbase2(ctx->Transform.ClipPlanesEnabled) + 1;
    }
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 9ca6deaabb6..20dd70ef734 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1891,7 +1891,7 @@ struct gl_program
     * For vertex and geometry shaders, true if the program uses the
     * gl_ClipDistance output.  Ignored for fragment shaders.
     */
-   GLboolean UsesClipDistanceOut;
+   unsigned ClipDistanceArraySize;
 
 
    /** Named parameters, constants, etc. from program text */
@@ -2614,7 +2614,6 @@ struct gl_shader_program
        * True if gl_ClipDistance is written to.  Copied into
        * gl_tess_eval_program by _mesa_copy_linked_program_data().
        */
-      GLboolean UsesClipDistance;
       GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
                                          0 if not present. */
    } TessEval;
@@ -2637,7 +2636,6 @@ struct gl_shader_program
        * True if gl_ClipDistance is written to.  Copied into
        * gl_geometry_program by _mesa_copy_linked_program_data().
        */
-      GLboolean UsesClipDistance;
       GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
                                          0 if not present. */
       bool UsesEndPrimitive;
@@ -2650,7 +2648,6 @@ struct gl_shader_program
        * True if gl_ClipDistance is written to.  Copied into gl_vertex_program
        * by _mesa_copy_linked_program_data().
        */
-      GLboolean UsesClipDistance;
       GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
                                          0 if not present. */
    } Vert;
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 18e463d4ccc..765602e50db 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2068,7 +2068,7 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
 {
    switch (type) {
    case MESA_SHADER_VERTEX:
-      dst->UsesClipDistanceOut = src->Vert.UsesClipDistance;
+      dst->ClipDistanceArraySize = src->Vert.ClipDistanceArraySize;
       break;
    case MESA_SHADER_TESS_CTRL: {
       struct gl_tess_ctrl_program *dst_tcp =
@@ -2083,7 +2083,7 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
       dst_tep->Spacing = src->TessEval.Spacing;
       dst_tep->VertexOrder = src->TessEval.VertexOrder;
       dst_tep->PointMode = src->TessEval.PointMode;
-      dst->UsesClipDistanceOut = src->TessEval.UsesClipDistance;
+      dst->ClipDistanceArraySize = src->TessEval.ClipDistanceArraySize;
       break;
    }
    case MESA_SHADER_GEOMETRY: {
@@ -2093,7 +2093,7 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
       dst_gp->Invocations = src->Geom.Invocations;
       dst_gp->InputType = src->Geom.InputType;
       dst_gp->OutputType = src->Geom.OutputType;
-      dst->UsesClipDistanceOut = src->Geom.UsesClipDistance;
+      dst->ClipDistanceArraySize = src->Geom.ClipDistanceArraySize;
       dst_gp->UsesEndPrimitive = src->Geom.UsesEndPrimitive;
       dst_gp->UsesStreams = src->Geom.UsesStreams;
       break;

From e70c66197ea10cf052010c7352420a2ae0b0a50a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 17 Oct 2015 23:15:28 +0200
Subject: [PATCH 15/85] gallium: add new properties for clip and cull distance
 usage

The TGSI usage mask can't be used, because these are declared as an output
array of 2 elements.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_strings.c  |  2 ++
 src/gallium/docs/source/tgsi.rst           | 10 ++++++++++
 src/gallium/include/pipe/p_shader_tokens.h |  4 +++-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 8271ea08177..89369d60f4e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -137,6 +137,8 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
    "TES_SPACING",
    "TES_VERTEX_ORDER_CW",
    "TES_POINT_MODE",
+   "NUM_CLIPDIST_ENABLED",
+   "NUM_CULLDIST_ENABLED",
 };
 
 const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] =
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 314fe1bb74f..01e18f3084e 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -3126,6 +3126,16 @@ TES_POINT_MODE
 If set to a non-zero value, this turns on point mode for the tessellator,
 which means that points will be generated instead of primitives.
 
+NUM_CLIPDIST_ENABLED
+""""""""""""""""""""
+
+How many clip distance scalar outputs are enabled.
+
+NUM_CULLDIST_ENABLED
+""""""""""""""""""""
+
+How many cull distance scalar outputs are enabled.
+
 
 Texture Sampling and Texture Formats
 ------------------------------------
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index b36e0a35b8d..e0ab9013dd5 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -267,7 +267,9 @@ union tgsi_immediate_data
 #define TGSI_PROPERTY_TES_SPACING            12
 #define TGSI_PROPERTY_TES_VERTEX_ORDER_CW    13
 #define TGSI_PROPERTY_TES_POINT_MODE         14
-#define TGSI_PROPERTY_COUNT                  15
+#define TGSI_PROPERTY_NUM_CLIPDIST_ENABLED   15
+#define TGSI_PROPERTY_NUM_CULLDIST_ENABLED   16
+#define TGSI_PROPERTY_COUNT                  17
 
 struct tgsi_property {
    unsigned Type         : 4;  /**< TGSI_TOKEN_TYPE_PROPERTY */

From 7c75f23cb92f7e4a55f2ae31df9274338ec60531 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 17 Oct 2015 23:17:52 +0200
Subject: [PATCH 16/85] st/mesa: pass the clip distance array size to drivers

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/state_tracker/st_program.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 87571a88e78..75ccaf2f26b 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -395,6 +395,10 @@ st_translate_vertex_program(struct st_context *st,
    if (ureg == NULL)
       return false;
 
+   if (stvp->Base.Base.ClipDistanceArraySize)
+      ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED,
+                    stvp->Base.Base.ClipDistanceArraySize);
+
    if (ST_DEBUG & DEBUG_MESA) {
       _mesa_print_program(&stvp->Base.Base);
       _mesa_print_program_parameters(st->ctx, &stvp->Base.Base);
@@ -1049,6 +1053,10 @@ st_translate_program_common(struct st_context *st,
    memset(outputMapping, 0, sizeof(outputMapping));
    memset(out_state, 0, sizeof(*out_state));
 
+   if (prog->ClipDistanceArraySize)
+      ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED,
+                    prog->ClipDistanceArraySize);
+
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
     */

From 8910ebd8e8ed7e163ae69bb85cda55531675e95d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 17 Oct 2015 23:17:52 +0200
Subject: [PATCH 17/85] tgsi/scan: use properties for clip/cull distance
 writemasks

No changes needed for drivers already relying on tgsi_shader_info.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 28 +++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index b84a1753eeb..4645ef26cab 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -369,19 +369,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                       procType == TGSI_PROCESSOR_GEOMETRY ||
                       procType == TGSI_PROCESSOR_TESS_CTRL ||
                       procType == TGSI_PROCESSOR_TESS_EVAL) {
-                     if (semName == TGSI_SEMANTIC_CLIPDIST) {
-                        info->num_written_clipdistance +=
-                           util_bitcount(fulldecl->Declaration.UsageMask);
-                        info->clipdist_writemask |=
-                           fulldecl->Declaration.UsageMask << (semIndex*4);
-                     }
-                     else if (semName == TGSI_SEMANTIC_CULLDIST) {
-                        info->num_written_culldistance +=
-                           util_bitcount(fulldecl->Declaration.UsageMask);
-                        info->culldist_writemask |=
-                           fulldecl->Declaration.UsageMask << (semIndex*4);
-                     }
-                     else if (semName == TGSI_SEMANTIC_VIEWPORT_INDEX) {
+                     if (semName == TGSI_SEMANTIC_VIEWPORT_INDEX) {
                         info->writes_viewport_index = TRUE;
                      }
                      else if (semName == TGSI_SEMANTIC_LAYER) {
@@ -432,9 +420,21 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
             const struct tgsi_full_property *fullprop
                = &parse.FullToken.FullProperty;
             unsigned name = fullprop->Property.PropertyName;
+            unsigned value = fullprop->u[0].Data;
 
             assert(name < Elements(info->properties));
-            info->properties[name] = fullprop->u[0].Data;
+            info->properties[name] = value;
+
+            switch (name) {
+            case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
+               info->num_written_clipdistance = value;
+               info->clipdist_writemask |= (1 << value) - 1;
+               break;
+            case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
+               info->num_written_culldistance = value;
+               info->culldist_writemask |= (1 << value) - 1;
+               break;
+            }
          }
          break;
 

From 85b946478c326df853926ed18bfbd898c0a514ef Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 20 Oct 2015 10:49:10 +0100
Subject: [PATCH 18/85] vc4: Add limited support for ibfe/ubfe.

This is just enough to cover our unpack modes, which will be used by some
new NIR-based lowering in the next commit.
---
 src/gallium/drivers/vc4/vc4_program.c | 42 +++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 1b590a2d0c4..d3e856a8530 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -876,6 +876,40 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
         *dest = result;
 }
 
+/** Handles sign-extended bitfield extracts for 16 bits. */
+static struct qreg
+ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
+              struct qreg bits)
+{
+        assert(bits.file == QFILE_UNIF &&
+               c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
+               c->uniform_data[bits.index] == 16);
+
+        assert(offset.file == QFILE_UNIF &&
+               c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
+        int offset_bit = c->uniform_data[offset.index];
+        assert(offset_bit % 16 == 0);
+
+        return qir_UNPACK_16_I(c, base, offset_bit / 16);
+}
+
+/** Handles unsigned bitfield extracts for 8 bits. */
+static struct qreg
+ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
+              struct qreg bits)
+{
+        assert(bits.file == QFILE_UNIF &&
+               c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
+               c->uniform_data[bits.index] == 8);
+
+        assert(offset.file == QFILE_UNIF &&
+               c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
+        int offset_bit = c->uniform_data[offset.index];
+        assert(offset_bit % 8 == 0);
+
+        return qir_UNPACK_8_I(c, base, offset_bit / 8);
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -1106,6 +1140,14 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                                 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
                 break;
 
+        case nir_op_ibitfield_extract:
+                *dest = ntq_emit_ibfe(c, src[0], src[1], src[2]);
+                break;
+
+        case nir_op_ubitfield_extract:
+                *dest = ntq_emit_ubfe(c, src[0], src[1], src[2]);
+                break;
+
         default:
                 fprintf(stderr, "unknown NIR ALU inst: ");
                 nir_print_instr(&instr->instr, stderr);

From 921feb8782bdc3c459922858bee6d55919467436 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 20:58:57 -0700
Subject: [PATCH 19/85] vc4: Switch our vertex attr lowering to being
 NIR-based.

This exposes more information to NIR's optimization, and should be
particularly useful when we do range-based optimization.

total uniforms in shared programs: 32066 -> 32065 (-0.00%)
uniforms in affected programs:     21 -> 20 (-4.76%)
total instructions in shared programs: 93104 -> 92630 (-0.51%)
instructions in affected programs:     31901 -> 31427 (-1.49%)
---
 src/gallium/drivers/vc4/vc4_nir_lower_io.c | 233 ++++++++++++++++++---
 src/gallium/drivers/vc4/vc4_program.c      | 110 +---------
 2 files changed, 200 insertions(+), 143 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 761e2c819c5..caf706aa2a6 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -23,6 +23,7 @@
 
 #include "vc4_qir.h"
 #include "glsl/nir/nir_builder.h"
+#include "util/u_format.h"
 
 /**
  * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
@@ -50,14 +51,182 @@ replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
         nir_instr_remove(&intr->instr);
 }
 
+static nir_ssa_def *
+vc4_nir_unpack_8i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_ubitfield_extract(b,
+                                     src,
+                                     nir_imm_int(b, 8 * chan),
+                                     nir_imm_int(b, 8));
+}
+
+/** Returns the 16 bit field as a sign-extended 32-bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_16i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_ibitfield_extract(b,
+                                     src,
+                                     nir_imm_int(b, 16 * chan),
+                                     nir_imm_int(b, 16));
+}
+
+/** Returns the 16 bit field as an unsigned 32 bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_16u(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        if (chan == 0) {
+                return nir_iand(b, src, nir_imm_int(b, 0xffff));
+        } else {
+                return nir_ushr(b, src, nir_imm_int(b, 16));
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_unpack_8f(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_swizzle(b, nir_unpack_unorm_4x8(b, src), &chan, 1, false);
+}
+
+static nir_ssa_def *
+vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
+                              nir_builder *b,
+                              nir_ssa_def **vpm_reads,
+                              uint8_t swiz,
+                              const struct util_format_description *desc)
+{
+        const struct util_format_channel_description *chan =
+                &desc->channel[swiz];
+        nir_ssa_def *temp;
+
+        if (swiz > UTIL_FORMAT_SWIZZLE_W) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                if (chan->normalized) {
+                        return nir_fmul(b,
+                                        nir_i2f(b, vpm_reads[swiz]),
+                                        nir_imm_float(b,
+                                                      1.0 / 0x7fffffff));
+                } else {
+                        return nir_i2f(b, vpm_reads[swiz]);
+                }
+        } else if (chan->size == 8 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[0];
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_ixor(b, vpm, nir_imm_int(b, 0x80808080));
+                        if (chan->normalized) {
+                                return nir_fsub(b, nir_fmul(b,
+                                                            vc4_nir_unpack_8f(b, temp, swiz),
+                                                            nir_imm_float(b, 2.0)),
+                                                nir_imm_float(b, 1.0));
+                        } else {
+                                return nir_fadd(b,
+                                                nir_i2f(b,
+                                                        vc4_nir_unpack_8i(b, temp,
+                                                                          swiz)),
+                                                nir_imm_float(b, -128.0));
+                        }
+                } else {
+                        if (chan->normalized) {
+                                return vc4_nir_unpack_8f(b, vpm, swiz);
+                        } else {
+                                return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz));
+                        }
+                }
+        } else if (chan->size == 16 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[swiz / 2];
+
+                /* Note that UNPACK_16F eats a half float, not ints, so we use
+                 * UNPACK_16_I for all of these.
+                 */
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1/32768.0f));
+                        } else {
+                                return temp;
+                        }
+                } else {
+                        temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1 / 65535.0));
+                        } else {
+                                return temp;
+                        }
+                }
+        } else {
+                return NULL;
+        }
+}
+
 static void
-vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
-                    nir_intrinsic_instr *intr)
+vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        if (c->stage == QSTAGE_FRAG && intr->const_index[0] ==
-            VC4_NIR_TLB_COLOR_READ_INPUT) {
+        int attr = intr->const_index[0];
+        enum pipe_format format = c->vs_key->attr_formats[attr];
+        uint32_t attr_size = util_format_get_blocksize(format);
+
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
+        /* Generate dword loads for the VPM values.  (Since these intrinsics
+         * may be reordered, the actual reads will be generated at the top of
+         * the shader by ntq_setup_inputs().)
+         */
+        nir_ssa_def *vpm_reads[4];
+        for (int i = 0; i < align(attr_size, 4) / 4; i++) {
+                nir_intrinsic_instr *intr_comp =
+                        nir_intrinsic_instr_create(c->s,
+                                                   nir_intrinsic_load_input);
+                intr_comp->num_components = 1;
+                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_builder_instr_insert(b, &intr_comp->instr);
+
+                vpm_reads[i] = &intr_comp->dest.ssa;
+        }
+
+        bool format_warned = false;
+        const struct util_format_description *desc =
+                util_format_description(format);
+
+        nir_ssa_def *dests[4];
+        for (int i = 0; i < 4; i++) {
+                uint8_t swiz = desc->swizzle[i];
+                dests[i] = vc4_nir_get_vattr_channel_vpm(c, b, vpm_reads, swiz,
+                                                         desc);
+
+                if (!dests[i]) {
+                        if (!format_warned) {
+                                fprintf(stderr,
+                                        "vtx element %d unsupported type: %s\n",
+                                        attr, util_format_name(format));
+                                format_warned = true;
+                        }
+                        dests[i] = nir_imm_float(b, 0.0);
+                }
+        }
+
+        replace_intrinsic_with_vec4(b, intr, dests);
+}
+
+static void
+vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
+                       nir_intrinsic_instr *intr)
+{
+        b->cursor = nir_before_instr(&intr->instr);
+
+        if (intr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
                 /* This doesn't need any lowering. */
                 return;
         }
@@ -87,38 +256,31 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                 dests[i] = &intr_comp->dest.ssa;
         }
 
-        switch (c->stage) {
-        case QSTAGE_FRAG:
-                if (input_var->data.location == VARYING_SLOT_FACE) {
-                        dests[0] = nir_fsub(b,
-                                            nir_imm_float(b, 1.0),
-                                            nir_fmul(b,
-                                                     nir_i2f(b, dests[0]),
-                                                     nir_imm_float(b, 2.0)));
-                        dests[1] = nir_imm_float(b, 0.0);
+        if (input_var->data.location == VARYING_SLOT_FACE) {
+                dests[0] = nir_fsub(b,
+                                    nir_imm_float(b, 1.0),
+                                    nir_fmul(b,
+                                             nir_i2f(b, dests[0]),
+                                             nir_imm_float(b, 2.0)));
+                dests[1] = nir_imm_float(b, 0.0);
+                dests[2] = nir_imm_float(b, 0.0);
+                dests[3] = nir_imm_float(b, 1.0);
+        } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
+                if (c->fs_key->point_sprite_mask &
+                    (1 << (input_var->data.location -
+                           VARYING_SLOT_VAR0))) {
+                        if (!c->fs_key->is_points) {
+                                dests[0] = nir_imm_float(b, 0.0);
+                                dests[1] = nir_imm_float(b, 0.0);
+                        }
+                        if (c->fs_key->point_coord_upper_left) {
+                                dests[1] = nir_fsub(b,
+                                                    nir_imm_float(b, 1.0),
+                                                    dests[1]);
+                        }
                         dests[2] = nir_imm_float(b, 0.0);
                         dests[3] = nir_imm_float(b, 1.0);
-                } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
-                        if (c->fs_key->point_sprite_mask &
-                            (1 << (input_var->data.location -
-                                   VARYING_SLOT_VAR0))) {
-                                if (!c->fs_key->is_points) {
-                                        dests[0] = nir_imm_float(b, 0.0);
-                                        dests[1] = nir_imm_float(b, 0.0);
-                                }
-                                if (c->fs_key->point_coord_upper_left) {
-                                        dests[1] = nir_fsub(b,
-                                                            nir_imm_float(b, 1.0),
-                                                            dests[1]);
-                                }
-                                dests[2] = nir_imm_float(b, 0.0);
-                                dests[3] = nir_imm_float(b, 1.0);
-                        }
                 }
-                break;
-        case QSTAGE_COORD:
-        case QSTAGE_VERT:
-                break;
         }
 
         replace_intrinsic_with_vec4(b, intr, dests);
@@ -232,7 +394,10 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
 
         switch (intr->intrinsic) {
         case nir_intrinsic_load_input:
-                vc4_nir_lower_input(c, b, intr);
+                if (c->stage == QSTAGE_FRAG)
+                        vc4_nir_lower_fs_input(c, b, intr);
+                else
+                        vc4_nir_lower_vertex_attr(c, b, intr);
                 break;
 
         case nir_intrinsic_store_output:
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index d3e856a8530..6e9ec6530c6 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -602,126 +602,18 @@ ntq_fsign(struct vc4_compile *c, struct qreg src)
                               qir_uniform_f(c, -1.0));
 }
 
-static struct qreg
-get_channel_from_vpm(struct vc4_compile *c,
-                     struct qreg *vpm_reads,
-                     uint8_t swiz,
-                     const struct util_format_description *desc)
-{
-        const struct util_format_channel_description *chan =
-                &desc->channel[swiz];
-        struct qreg temp;
-
-        if (swiz > UTIL_FORMAT_SWIZZLE_W)
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        else if (chan->size == 32 &&
-                 chan->type == UTIL_FORMAT_TYPE_FLOAT) {
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        } else if (chan->size == 32 &&
-                   chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                if (chan->normalized) {
-                        return qir_FMUL(c,
-                                        qir_ITOF(c, vpm_reads[swiz]),
-                                        qir_uniform_f(c,
-                                                      1.0 / 0x7fffffff));
-                } else {
-                        return qir_ITOF(c, vpm_reads[swiz]);
-                }
-        } else if (chan->size == 8 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[0];
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_XOR(c, vpm, qir_uniform_ui(c, 0x80808080));
-                        if (chan->normalized) {
-                                return qir_FSUB(c, qir_FMUL(c,
-                                                            qir_UNPACK_8_F(c, temp, swiz),
-                                                            qir_uniform_f(c, 2.0)),
-                                                qir_uniform_f(c, 1.0));
-                        } else {
-                                return qir_FADD(c,
-                                                qir_ITOF(c,
-                                                         qir_UNPACK_8_I(c, temp,
-                                                                        swiz)),
-                                                qir_uniform_f(c, -128.0));
-                        }
-                } else {
-                        if (chan->normalized) {
-                                return qir_UNPACK_8_F(c, vpm, swiz);
-                        } else {
-                                return qir_ITOF(c, qir_UNPACK_8_I(c, vpm, swiz));
-                        }
-                }
-        } else if (chan->size == 16 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[swiz / 2];
-
-                /* Note that UNPACK_16F eats a half float, not ints, so we use
-                 * UNPACK_16_I for all of these.
-                 */
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_ITOF(c, qir_UNPACK_16_I(c, vpm, swiz % 2));
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1/32768.0f));
-                        } else {
-                                return temp;
-                        }
-                } else {
-                        /* UNPACK_16I sign-extends, so we have to emit ANDs. */
-                        temp = vpm;
-                        if (swiz == 1 || swiz == 3)
-                                temp = qir_UNPACK_16_I(c, temp, 1);
-                        temp = qir_AND(c, temp, qir_uniform_ui(c, 0xffff));
-                        temp = qir_ITOF(c, temp);
-
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1 / 65535.0));
-                        } else {
-                                return temp;
-                        }
-                }
-        } else {
-                return c->undef;
-        }
-}
-
 static void
 emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
-        struct qreg vpm_reads[4];
 
         c->vattr_sizes[attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 struct qreg vpm = { QFILE_VPM, attr * 4 + i };
-                vpm_reads[i] = qir_MOV(c, vpm);
+                c->inputs[attr * 4 + i] = qir_MOV(c, vpm);
                 c->num_inputs++;
         }
-
-        bool format_warned = false;
-        const struct util_format_description *desc =
-                util_format_description(format);
-
-        for (int i = 0; i < 4; i++) {
-                uint8_t swiz = desc->swizzle[i];
-                struct qreg result = get_channel_from_vpm(c, vpm_reads,
-                                                          swiz, desc);
-
-                if (result.file == QFILE_NULL) {
-                        if (!format_warned) {
-                                fprintf(stderr,
-                                        "vtx element %d unsupported type: %s\n",
-                                        attr, util_format_name(format));
-                                format_warned = true;
-                        }
-                        result = qir_uniform_f(c, 0.0);
-                }
-                c->inputs[attr * 4 + i] = result;
-        }
 }
 
 static void

From 814f31457e9ae83d4f1e39236f704721b279b73d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 20 Oct 2015 18:26:02 +0200
Subject: [PATCH 20/85] gallium: add PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT

This avoids a serious r600g bug leading to a GPU hang.
The chances this bug will get fixed are pretty low now.

I deeply regret listening to others and not pushing this patch, leaving
other users with a GPU-crashing driver. Yes, it should be fixed
in the compiler and it's ugly, but users couldn't care less about that.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=86720

Cc: 11.0 10.6 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_limits.h    | 2 ++
 src/gallium/auxiliary/tgsi/tgsi_exec.h           | 2 ++
 src/gallium/docs/source/screen.rst               | 4 ++++
 src/gallium/drivers/freedreno/freedreno_screen.c | 2 ++
 src/gallium/drivers/i915/i915_screen.c           | 2 ++
 src/gallium/drivers/ilo/ilo_screen.c             | 2 ++
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 4 ++++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 2 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 2 ++
 src/gallium/drivers/r300/r300_screen.c           | 4 ++++
 src/gallium/drivers/r600/r600_pipe.c             | 6 ++++++
 src/gallium/drivers/radeonsi/si_pipe.c           | 2 ++
 src/gallium/drivers/svga/svga_screen.c           | 4 ++++
 src/gallium/drivers/vc4/vc4_screen.c             | 2 ++
 src/gallium/include/pipe/p_defines.h             | 3 ++-
 src/mesa/state_tracker/st_extensions.c           | 3 +++
 16 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index 571c615f9f8..ad64ae058b6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -137,6 +137,8 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    }
    /* if we get here, we missed a shader cap above (and should have seen
     * a compiler warning.)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index a371aa95e70..f86adcec506 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -474,6 +474,8 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    }
    /* if we get here, we missed a shader cap above (and should have seen
     * a compiler warning.)
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 72f7596886d..151afb2dffe 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -367,6 +367,10 @@ to be 0.
   are supported.
 * ``PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE``: Whether the driver doesn't
   ignore tgsi_declaration_range::Last for shader inputs and outputs.
+* ``PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT``: The maximum number of
+  iterations a loop may have and still be a candidate for unrolling. It is
+  only a hint to state trackers; there is no guarantee that any loop will
+  actually be unrolled.
 
 
 .. _pipe_compute_cap:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index f85e4586413..50d140fe903 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -412,6 +412,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index c91408d3d9b..5812af626cb 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -167,6 +167,8 @@ i915_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_sha
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       default:
          debug_printf("%s: Unknown cap %u.\n", __FUNCTION__, cap);
          return 0;
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index acf688fc02c..e1a7dc56685 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -138,6 +138,8 @@ ilo_get_shader_param(struct pipe_screen *screen, unsigned shader,
       return PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
       return 1;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
 
    default:
       return 0;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index d4cf143b9a3..03301649e38 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -264,6 +264,8 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       default:
          debug_printf("unknown vertex shader param %d\n", param);
          return 0;
@@ -305,6 +307,8 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       default:
          debug_printf("unknown fragment shader param %d\n", param);
          return 0;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index a4431f20e14..ec51d00f266 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -300,6 +300,8 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    default:
       NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
       return 0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index d34c8a2b07b..af8e5f72670 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -313,6 +313,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       return 16; /* would be 32 in linked (OpenGL-style) mode */
    case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
       return 16; /* XXX not sure if more are really safe */
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    default:
       NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
       return 0;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index c1c522b0a3a..a576abdfaf2 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -303,6 +303,8 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
             return 0;
+        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+            return 32;
         case PIPE_SHADER_CAP_PREFERRED_IR:
             return PIPE_SHADER_IR_TGSI;
         }
@@ -359,6 +361,8 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
             return 0;
+        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+            return 32;
         case PIPE_SHADER_CAP_PREFERRED_IR:
             return PIPE_SHADER_IR_TGSI;
         }
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 75de553be2b..9a97de9965e 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -511,6 +511,12 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 		return 0;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		/* due to a bug in the shader compiler, some loops hang
+		 * if they are not unrolled, see:
+		 *    https://bugs.freedesktop.org/show_bug.cgi?id=86720
+		 */
+		return 255;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c084f03cd25..5f910c95ef3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -508,6 +508,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
 		return 1;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 9bf661fab8c..17b042e7d95 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -456,6 +456,8 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       }
       /* If we get here, we failed to handle a cap above */
       debug_printf("Unexpected fragment shader query %u\n", param);
@@ -512,6 +514,8 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       }
       /* If we get here, we failed to handle a cap above */
       debug_printf("Unexpected vertex shader query %u\n", param);
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 3b12464a2f6..774ec095652 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -337,6 +337,8 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                 return VC4_MAX_TEXTURE_SAMPLERS;
         case PIPE_SHADER_CAP_PREFERRED_IR:
                 return PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
                 return 0;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 3a1265dcc22..1ad545aae09 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -697,7 +697,8 @@ enum pipe_shader_cap
    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED, /* all rounding modes */
    PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED,
    PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED,
-   PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE
+   PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE,
+   PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT,
 };
 
 /**
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index e2902923cb7..d4724b46e0a 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -249,6 +249,9 @@ void st_init_limits(struct pipe_screen *screen,
 
       if (options->EmitNoLoops)
          options->MaxUnrollIterations = MIN2(screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS), 65536);
+      else
+         options->MaxUnrollIterations = screen->get_shader_param(screen, sh,
+                                      PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT);
 
       options->LowerClipDistance = true;
    }

From 6994d8ec01273f15d91e5330960194630c1c02e0 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.l.velikov@gmail.com>
Date: Sat, 17 Oct 2015 23:42:13 +0100
Subject: [PATCH 21/85] i965: silence incompatible pointer type warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

src/mesa/drivers/dri/i965/brw_program.c:94:39:
warning: passing argument 1 of ‘_mesa_init_gl_program’ from incompatible
pointer type [-Wincompatible-pointer-types]
          return _mesa_init_gl_program(&prog->program, target, id);

                                       ^

Runtime was unaffected as brw_geometry_program is subclassed from
gl_geometry_program, thus the address passed was the same.

Fixes: bcb56c2c69d (program: convert _mesa_init_gl_program() to take
struct gl_program *)
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_program.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index b547d07f0ca..1ccfa1b6a1d 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -91,7 +91,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
       if (prog) {
          prog->id = get_new_program_id(brw->intelScreen);
 
-         return _mesa_init_gl_program(&prog->program, target, id);
+         return _mesa_init_gl_program(&prog->program.Base, target, id);
       } else {
          return NULL;
       }

From b1f8ef5ae3697b9b73f2ad7d07714f945c5175c3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 08:40:54 -0600
Subject: [PATCH 22/85] mesa: add more cases to print_list() in dlist.c

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/mesa/main/dlist.c | 46 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index e8059c7b260..fdb839c2c44 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -9741,6 +9741,46 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
                    n[3].f, n[4].f, n[5].f, n[6].f,
                    get_pointer(&n[7]));
             break;
+         case OPCODE_BLEND_COLOR:
+            fprintf(f, "BlendColor %f, %f, %f, %f\n",
+                    n[1].f, n[2].f, n[3].f, n[4].f);
+            break;
+         case OPCODE_BLEND_EQUATION:
+            fprintf(f, "BlendEquation %s\n",
+                    enum_string(n[1].e));
+            break;
+         case OPCODE_BLEND_EQUATION_SEPARATE:
+            fprintf(f, "BlendEquationSeparate %s, %s\n",
+                    enum_string(n[1].e),
+                    enum_string(n[2].e));
+            break;
+         case OPCODE_BLEND_FUNC_SEPARATE:
+            fprintf(f, "BlendFuncSeparate %s, %s, %s, %s\n",
+                    enum_string(n[1].e),
+                    enum_string(n[2].e),
+                    enum_string(n[3].e),
+                    enum_string(n[4].e));
+            break;
+         case OPCODE_BLEND_EQUATION_I:
+            fprintf(f, "BlendEquationi %u, %s\n",
+                    n[1].ui, enum_string(n[2].e));
+            break;
+         case OPCODE_BLEND_EQUATION_SEPARATE_I:
+            fprintf(f, "BlendEquationSeparatei %u, %s, %s\n",
+                    n[1].ui, enum_string(n[2].e), enum_string(n[3].e));
+            break;
+         case OPCODE_BLEND_FUNC_I:
+            fprintf(f, "BlendFunci %u, %s, %s\n",
+                    n[1].ui, enum_string(n[2].e), enum_string(n[3].e));
+            break;
+         case OPCODE_BLEND_FUNC_SEPARATE_I:
+            fprintf(f, "BlendFuncSeparatei %u, %s, %s, %s, %s\n",
+                    n[1].ui,
+                    enum_string(n[2].e),
+                    enum_string(n[3].e),
+                    enum_string(n[4].e),
+                    enum_string(n[5].e));
+            break;
          case OPCODE_CALL_LIST:
             fprintf(f, "CallList %d\n", (int) n[1].ui);
             break;
@@ -9761,6 +9801,9 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
          case OPCODE_LINE_STIPPLE:
             fprintf(f, "LineStipple %d %x\n", n[1].i, (int) n[2].us);
             break;
+         case OPCODE_LINE_WIDTH:
+            fprintf(f, "LineWidth %f\n", n[1].f);
+            break;
          case OPCODE_LOAD_IDENTITY:
             fprintf(f, "LoadIdentity\n");
             break;
@@ -9790,6 +9833,9 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
             fprintf(f, "Ortho %g %g %g %g %g %g\n",
                          n[1].f, n[2].f, n[3].f, n[4].f, n[5].f, n[6].f);
             break;
+         case OPCODE_POINT_SIZE:
+            fprintf(f, "PointSize %f\n", n[1].f);
+            break;
          case OPCODE_POP_ATTRIB:
             fprintf(f, "PopAttrib\n");
             break;

From e24d04e436ed48d4a0aac90590cbaa40da936208 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 08:43:02 -0600
Subject: [PATCH 23/85] mesa: fix incorrect opcode in save_BlendFunci()

Fixes assertion failure with new piglit
arb_draw_buffers_blend-state_set_get test.

Cc: mesa-stable@lists.freedesktop.org

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/mesa/main/dlist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index fdb839c2c44..2b65b2ea949 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -1400,7 +1400,7 @@ save_BlendFunci(GLuint buf, GLenum sfactor, GLenum dfactor)
    GET_CURRENT_CONTEXT(ctx);
    Node *n;
    ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
-   n = alloc_instruction(ctx, OPCODE_BLEND_FUNC_SEPARATE_I, 3);
+   n = alloc_instruction(ctx, OPCODE_BLEND_FUNC_I, 3);
    if (n) {
       n[1].ui = buf;
       n[2].e = sfactor;

From c5de38abc9eb71ba89fb1332946ee034e5a0c649 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 09:32:50 -0600
Subject: [PATCH 24/85] st/mesa: use MAX3() instead of MAX2(MAX2) in
 draw_textured_quad()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_cb_drawpixels.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 20cbfdefd23..c0e0484042b 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -667,7 +667,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* user textures, plus the drawpix textures */
    if (fpv) {
       struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
-      uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1,
+      uint num = MAX3(fpv->drawpix_sampler + 1,
+                      fpv->pixelmap_sampler + 1,
                       st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]);
 
       memcpy(sampler_views, st->state.sampler_views[PIPE_SHADER_FRAGMENT],

From 31ae52acceb7defc84080e32db6d5b1b0fe2eace Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 09:38:00 -0600
Subject: [PATCH 25/85] st/mesa: check for out-of-memory in st_DrawPixels()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before, if make_texture() or st_create_texture_sampler_view() failed
we silently no-op'd the glDrawPixels.  Now, set GL_OUT_OF_MEMORY.
This also allows us to un-nest a bunch of code.

v2: also check if allocation of sv[1] fails, per Jose.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_cb_drawpixels.c | 82 +++++++++++++----------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index c0e0484042b..de7d1f6489a 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -975,6 +975,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    int num_sampler_view = 1;
    struct gl_pixelstore_attrib clippedUnpack;
    struct st_fp_variant *fpv = NULL;
+   struct pipe_resource *pt;
 
    /* Mesa state should be up to date by now */
    assert(ctx->NewState == 0x0);
@@ -1030,42 +1031,53 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
       st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
    }
 
-   /* draw with textured quad */
-   {
-      struct pipe_resource *pt
-         = make_texture(st, width, height, format, type, unpack, pixels);
-      if (pt) {
-         sv[0] = st_create_texture_sampler_view(st->pipe, pt);
-
-         if (sv[0]) {
-            /* Create a second sampler view to read stencil.
-             * The stencil is written using the shader stencil export
-             * functionality. */
-            if (write_stencil) {
-               enum pipe_format stencil_format =
-                     util_format_stencil_only(pt->format);
-               /* we should not be doing pixel map/transfer (see above) */
-               assert(num_sampler_view == 1);
-               sv[1] = st_create_texture_sampler_view_format(st->pipe, pt,
-                                                             stencil_format);
-               num_sampler_view++;
-            }
-
-            draw_textured_quad(ctx, x, y, ctx->Current.RasterPos[2],
-                               width, height,
-                               ctx->Pixel.ZoomX, ctx->Pixel.ZoomY,
-                               sv,
-                               num_sampler_view,
-                               driver_vp,
-                               driver_fp, fpv,
-                               color, GL_FALSE, write_depth, write_stencil);
-            pipe_sampler_view_reference(&sv[0], NULL);
-            if (num_sampler_view > 1)
-               pipe_sampler_view_reference(&sv[1], NULL);
-         }
-         pipe_resource_reference(&pt, NULL);
-      }
+   /* Put glDrawPixels image into a texture */
+   pt = make_texture(st, width, height, format, type, unpack, pixels);
+   if (!pt) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
+      return;
    }
+
+   /* create sampler view for the image */
+   sv[0] = st_create_texture_sampler_view(st->pipe, pt);
+   if (!sv[0]) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
+      pipe_resource_reference(&pt, NULL);
+      return;
+   }
+
+   /* Create a second sampler view to read stencil.  The stencil is
+    * written using the shader stencil export functionality.
+    */
+   if (write_stencil) {
+      enum pipe_format stencil_format =
+         util_format_stencil_only(pt->format);
+      /* we should not be doing pixel map/transfer (see above) */
+      assert(num_sampler_view == 1);
+      sv[1] = st_create_texture_sampler_view_format(st->pipe, pt,
+                                                    stencil_format);
+      if (!sv[1]) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
+         pipe_resource_reference(&pt, NULL);
+         pipe_sampler_view_reference(&sv[0], NULL);
+         return;
+      }
+      num_sampler_view++;
+   }
+
+   draw_textured_quad(ctx, x, y, ctx->Current.RasterPos[2],
+                      width, height,
+                      ctx->Pixel.ZoomX, ctx->Pixel.ZoomY,
+                      sv,
+                      num_sampler_view,
+                      driver_vp,
+                      driver_fp, fpv,
+                      color, GL_FALSE, write_depth, write_stencil);
+   pipe_sampler_view_reference(&sv[0], NULL);
+   if (num_sampler_view > 1)
+      pipe_sampler_view_reference(&sv[1], NULL);
+
+   pipe_resource_reference(&pt, NULL);
 }
 
 

From cf405922eb2bd4d1dfae896caa9d58980875e7ec Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 09:54:29 -0600
Subject: [PATCH 26/85] mesa: make memcpy_texture() non-static

So that we can use it directly from the mesa/gallium state tracker.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/main/texstore.c | 40 ++++++++++++++++++++--------------------
 src/mesa/main/texstore.h | 11 +++++++++++
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index e50964e79e4..4b13c42ed74 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -97,16 +97,16 @@ static const GLubyte map_1032[6] = { 1, 0, 3, 2, ZERO, ONE };
  * No pixel transfer operations or special texel encodings allowed.
  * 1D, 2D and 3D images supported.
  */
-static void
-memcpy_texture(struct gl_context *ctx,
-	       GLuint dimensions,
-               mesa_format dstFormat,
-               GLint dstRowStride,
-               GLubyte **dstSlices,
-               GLint srcWidth, GLint srcHeight, GLint srcDepth,
-               GLenum srcFormat, GLenum srcType,
-               const GLvoid *srcAddr,
-               const struct gl_pixelstore_attrib *srcPacking)
+void
+_mesa_memcpy_texture(struct gl_context *ctx,
+                     GLuint dimensions,
+                     mesa_format dstFormat,
+                     GLint dstRowStride,
+                     GLubyte **dstSlices,
+                     GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                     GLenum srcFormat, GLenum srcType,
+                     const GLvoid *srcAddr,
+                     const struct gl_pixelstore_attrib *srcPacking)
 {
    const GLint srcRowStride = _mesa_image_row_stride(srcPacking, srcWidth,
                                                      srcFormat, srcType);
@@ -296,11 +296,11 @@ _mesa_texstore_ycbcr(TEXSTORE_PARAMS)
    assert(baseInternalFormat == GL_YCBCR_MESA);
 
    /* always just memcpy since no pixel transfer ops apply */
-   memcpy_texture(ctx, dims,
-                  dstFormat,
-                  dstRowStride, dstSlices,
-                  srcWidth, srcHeight, srcDepth, srcFormat, srcType,
-                  srcAddr, srcPacking);
+   _mesa_memcpy_texture(ctx, dims,
+                        dstFormat,
+                        dstRowStride, dstSlices,
+                        srcWidth, srcHeight, srcDepth, srcFormat, srcType,
+                        srcAddr, srcPacking);
 
    /* Check if we need byte swapping */
    /* XXX the logic here _might_ be wrong */
@@ -899,11 +899,11 @@ _mesa_texstore_memcpy(TEXSTORE_PARAMS)
       return GL_FALSE;
    }
 
-   memcpy_texture(ctx, dims,
-                  dstFormat,
-                  dstRowStride, dstSlices,
-                  srcWidth, srcHeight, srcDepth, srcFormat, srcType,
-                  srcAddr, srcPacking);
+   _mesa_memcpy_texture(ctx, dims,
+                        dstFormat,
+                        dstRowStride, dstSlices,
+                        srcWidth, srcHeight, srcDepth, srcFormat, srcType,
+                        srcAddr, srcPacking);
    return GL_TRUE;
 }
 /**
diff --git a/src/mesa/main/texstore.h b/src/mesa/main/texstore.h
index 2c974f74afb..f08dc08edde 100644
--- a/src/mesa/main/texstore.h
+++ b/src/mesa/main/texstore.h
@@ -74,6 +74,17 @@ _mesa_texstore_needs_transfer_ops(struct gl_context *ctx,
                                   GLenum baseInternalFormat,
                                   mesa_format dstFormat);
 
+extern void
+_mesa_memcpy_texture(struct gl_context *ctx,
+                     GLuint dimensions,
+                     mesa_format dstFormat,
+                     GLint dstRowStride,
+                     GLubyte **dstSlices,
+                     GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                     GLenum srcFormat, GLenum srcType,
+                     const GLvoid *srcAddr,
+                     const struct gl_pixelstore_attrib *srcPacking);
+
 extern GLboolean
 _mesa_texstore_can_use_memcpy(struct gl_context *ctx,
                               GLenum baseInternalFormat, mesa_format dstFormat,

From d11fefa96165836ffeed531a74319a64aa98a696 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 11:54:06 -0600
Subject: [PATCH 27/85] st/mesa: optimize 4-component ubyte glDrawPixels

If we didn't find a gallium surface format that exactly matched the
glDrawPixels format/type combination, we used some other 32-bit packed
RGBA format and swizzled the whole image in the mesa texstore/format code.

That slow path can be avoided in some common cases by using the
pipe_sampler_view's swizzle terms to do the swizzling at texture sampling
time instead.

For now, only GL_RGBA/ubyte and GL_BGRA/ubyte combinations are supported.
In the future other formats and types like GL_UNSIGNED_INT_8_8_8_8 could
be added.

v2: fix incorrect swizzle setup (need to invert the tex format's swizzle)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/mesa/state_tracker/st_cb_drawpixels.c | 104 ++++++++++++++++++++--
 1 file changed, 95 insertions(+), 9 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index de7d1f6489a..262ad809c58 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -395,15 +395,35 @@ make_texture(struct st_context *st,
        * Note that the image is actually going to be upside down in
        * the texture.  We deal with that with texcoords.
        */
-      success = _mesa_texstore(ctx, 2,           /* dims */
-                               baseInternalFormat, /* baseInternalFormat */
-                               mformat,          /* mesa_format */
-                               transfer->stride, /* dstRowStride, bytes */
-                               &dest,            /* destSlices */
-                               width, height, 1, /* size */
-                               format, type,     /* src format/type */
-                               pixels,           /* data source */
-                               unpack);
+      if ((format == GL_RGBA || format == GL_BGRA)
+          && type == GL_UNSIGNED_BYTE) {
+         /* Use a memcpy-based texstore to avoid software pixel swizzling.
+          * We'll do the necessary swizzling with the pipe_sampler_view to
+          * give much better performance.
+          * XXX in the future, expand this to accomodate more format and
+          * type combinations.
+          */
+         _mesa_memcpy_texture(ctx, 2,
+                              mformat,          /* mesa_format */
+                              transfer->stride, /* dstRowStride, bytes */
+                              &dest,            /* destSlices */
+                              width, height, 1, /* size */
+                              format, type,     /* src format/type */
+                              pixels,           /* data source */
+                              unpack);
+         success = GL_TRUE;
+      }
+      else {
+         success = _mesa_texstore(ctx, 2,           /* dims */
+                                  baseInternalFormat, /* baseInternalFormat */
+                                  mformat,          /* mesa_format */
+                                  transfer->stride, /* dstRowStride, bytes */
+                                  &dest,            /* destSlices */
+                                  width, height, 1, /* size */
+                                  format, type,     /* src format/type */
+                                  pixels,           /* data source */
+                                  unpack);
+      }
 
       /* unmap */
       pipe_transfer_unmap(pipe, transfer);
@@ -957,6 +977,69 @@ clamp_size(struct pipe_context *pipe, GLsizei *width, GLsizei *height,
 }
 
 
+/**
+ * Search the array of 4 swizzle components for the named component and return
+ * its position.
+ */
+static unsigned
+search_swizzle(const unsigned char swizzle[4], unsigned component)
+{
+   unsigned i;
+   for (i = 0; i < 4; i++) {
+      if (swizzle[i] == component)
+         return i;
+   }
+   assert(!"search_swizzle() failed");
+   return 0;
+}
+
+
+/**
+ * Set the sampler view's swizzle terms.  This is used to handle RGBA
+ * swizzling when the incoming image format isn't an exact match for
+ * the actual texture format.  For example, if we have glDrawPixels(
+ * GL_RGBA, GL_UNSIGNED_BYTE) and we chose the texture format
+ * PIPE_FORMAT_B8G8R8A8 then we can do use the sampler view swizzle to
+ * avoid swizzling all the pixels in software in the texstore code.
+ */
+static void
+setup_sampler_swizzle(struct pipe_sampler_view *sv, GLenum format, GLenum type)
+{
+   if ((format == GL_RGBA || format == GL_BGRA) && type == GL_UNSIGNED_BYTE) {
+      const struct util_format_description *desc =
+         util_format_description(sv->texture->format);
+      unsigned c0, c1, c2, c3;
+
+      /* Every gallium driver supports at least one 32-bit packed RGBA format.
+       * We must have chosen one for (GL_RGBA, GL_UNSIGNED_BYTE).
+       */
+      assert(desc->block.bits == 32);
+
+      /* invert the format's swizzle to setup the sampler's swizzle */
+      if (format == GL_RGBA) {
+         c0 = UTIL_FORMAT_SWIZZLE_X;
+         c1 = UTIL_FORMAT_SWIZZLE_Y;
+         c2 = UTIL_FORMAT_SWIZZLE_Z;
+         c3 = UTIL_FORMAT_SWIZZLE_W;
+      }
+      else {
+         assert(format == GL_BGRA);
+         c0 = UTIL_FORMAT_SWIZZLE_Z;
+         c1 = UTIL_FORMAT_SWIZZLE_Y;
+         c2 = UTIL_FORMAT_SWIZZLE_X;
+         c3 = UTIL_FORMAT_SWIZZLE_W;
+      }
+      sv->swizzle_r = search_swizzle(desc->swizzle, c0);
+      sv->swizzle_g = search_swizzle(desc->swizzle, c1);
+      sv->swizzle_b = search_swizzle(desc->swizzle, c2);
+      sv->swizzle_a = search_swizzle(desc->swizzle, c3);
+   }
+   else {
+      /* use the default sampler swizzle */
+   }
+}
+
+
 /**
  * Called via ctx->Driver.DrawPixels()
  */
@@ -1046,6 +1129,9 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
       return;
    }
 
+   /* Set up the sampler view's swizzle */
+   setup_sampler_swizzle(sv[0], format, type);
+
    /* Create a second sampler view to read stencil.  The stencil is
     * written using the shader stencil export functionality.
     */

From f6d4e20d10d2316b70b73676e97b2c1e5cf7634a Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 12:33:00 -0600
Subject: [PATCH 28/85] vbo: reduce number of vertex buffer mappings for vertex
 attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whenever we got a glColor, glNormal, glTexCoord, etc. call outside a
glBegin/End pair, we'd immediately map a vertex buffer to begin
accumulating vertex data.  In some cases, such as with display lists,
this led to excessive vertex buffer mapping.  For example, if we have
a display list such as:

glNewList(42, GL_COMPILE);
glBegin(prim);
glVertex2f();
...
glVertex2f();
glEnd();
glEndList();

and we then did:

glColor3f();
glCallList(42);

We'd map a vertex buffer as soon as we saw glColor3f but we'd never
actually write anything to it.  Note that the vertex position data
was put into a vertex buffer during display list compilation.

With this change, we delay mapping the vertex buffer until we actually
have a vertex to write to it (triggered by a glVertex() call).  In the
above case, we no longer map a vertex buffer when setting the color and
calling the list.

For drivers such as VMware's, reducing buffer mappings gives improved
performance.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/vbo/vbo_exec_api.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index 7ae08fe3062..789869a9790 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -446,10 +446,6 @@ do {									\
                                                                         \
    assert(sz == 1 || sz == 2);                                          \
                                                                         \
-   if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) {     \
-      vbo_exec_begin_vertices(ctx);					\
-   }									\
-                                                                        \
    /* check if attribute size or type is changing */                    \
    if (unlikely(exec->vtx.active_sz[A] != N * sz) ||                    \
        unlikely(exec->vtx.attrtype[A] != T)) {                          \
@@ -470,6 +466,15 @@ do {									\
       /* This is a glVertex call */					\
       GLuint i;								\
 									\
+      if (unlikely((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0)) { \
+         vbo_exec_begin_vertices(ctx);                                  \
+      }                                                                 \
+                                                                        \
+      if (unlikely(!exec->vtx.buffer_ptr)) {                            \
+         vbo_exec_vtx_map(exec);                                        \
+      }                                                                 \
+      assert(exec->vtx.buffer_ptr);                                     \
+                                                                        \
       /* copy 32-bit words */                                           \
       for (i = 0; i < exec->vtx.vertex_size; i++)			\
 	 exec->vtx.buffer_ptr[i] = exec->vtx.vertex[i];			\
@@ -482,7 +487,10 @@ do {									\
 									\
       if (++exec->vtx.vert_count >= exec->vtx.max_vert)			\
 	 vbo_exec_vtx_wrap( exec );					\
-   }									\
+   } else {                                                             \
+      /* we now have accumulated per-vertex attributes */               \
+      ctx->Driver.NeedFlush |= FLUSH_UPDATE_CURRENT;                    \
+   }                                                                    \
 } while (0)
 
 #define ERROR(err) _mesa_error( ctx, err, __func__ )

From f7272032bec2e92e05e9c870e9655ca069d3d988 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 09:52:09 -0600
Subject: [PATCH 29/85] mesa: simple whitespace fix in texstore.c

---
 src/mesa/main/texstore.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 4b13c42ed74..d7671738b18 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -906,6 +906,8 @@ _mesa_texstore_memcpy(TEXSTORE_PARAMS)
                         srcAddr, srcPacking);
    return GL_TRUE;
 }
+
+
 /**
  * Store user data into texture memory.
  * Called via glTex[Sub]Image1/2/3D()

From 6cc596c66bb41cd3fa60bbf630c9ea4f661a64cc Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 22:30:22 -0600
Subject: [PATCH 30/85] tnl: add some comments in render_line_loop code

And remove '(void) flags' line which is not needed.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/tnl/t_vb_rendertmp.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/tnl/t_vb_rendertmp.h b/src/mesa/tnl/t_vb_rendertmp.h
index 44dee763594..4bfc6b15d3b 100644
--- a/src/mesa/tnl/t_vb_rendertmp.h
+++ b/src/mesa/tnl/t_vb_rendertmp.h
@@ -124,19 +124,19 @@ static void TAG(render_line_loop)( struct gl_context *ctx,
    GLuint i;
    LOCAL_VARS;
 
-   (void) flags;
-
    INIT(GL_LINE_LOOP);
 
    if (start+1 < count) {
       if (TEST_PRIM_BEGIN(flags)) {
 	 RESET_STIPPLE;
+         /* draw the first line from v[0] to v[1] */
          if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION_EXT)
             RENDER_LINE( ELT(start), ELT(start+1) );
          else
             RENDER_LINE( ELT(start+1), ELT(start) );
       }
 
+      /* draw lines from v[1] to v[n-1] */
       for ( i = start+2 ; i < count ; i++) {
          if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION_EXT)
             RENDER_LINE( ELT(i-1), ELT(i) );
@@ -145,6 +145,7 @@ static void TAG(render_line_loop)( struct gl_context *ctx,
       }
 
       if ( TEST_PRIM_END(flags)) {
+         /* draw final line from v[n-1] to v[0] (the very first vertex) */
          if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION_EXT)
             RENDER_LINE( ELT(count-1), ELT(start) );
          else

From 971b56c643f35a2fb2f0f21cd5fd45ce9b155d4b Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 14:11:54 -0600
Subject: [PATCH 31/85] vbo: remove unneeded ctx parameter for merge_prims()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_save_api.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index fdc677f9a07..6688ba0d797 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -330,8 +330,7 @@ _save_reset_counters(struct gl_context *ctx)
  * previous prim.
  */
 static void
-merge_prims(struct gl_context *ctx,
-            struct _mesa_prim *prim_list,
+merge_prims(struct _mesa_prim *prim_list,
             GLuint *prim_count)
 {
    GLuint i;
@@ -442,7 +441,7 @@ _save_compile_vertex_list(struct gl_context *ctx)
     */
    save->copied.nr = _save_copy_vertices(ctx, node, save->buffer);
 
-   merge_prims(ctx, node->prim, &node->prim_count);
+   merge_prims(node->prim, &node->prim_count);
 
    /* Deal with GL_COMPILE_AND_EXECUTE:
     */

From e05ffcf1d94d01da37b4f488aa05716c62ff6547 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 20:22:25 -0600
Subject: [PATCH 32/85] vbo: make vbo_exec_vtx_wrap() static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_exec.h     | 2 --
 src/mesa/vbo/vbo_exec_api.c | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h
index 00378eb7984..a80b2c908d1 100644
--- a/src/mesa/vbo/vbo_exec.h
+++ b/src/mesa/vbo/vbo_exec.h
@@ -160,8 +160,6 @@ void vbo_exec_vtx_flush( struct vbo_exec_context *exec, GLboolean unmap );
 void vbo_exec_vtx_map( struct vbo_exec_context *exec );
 
 
-void vbo_exec_vtx_wrap( struct vbo_exec_context *exec );
-
 void vbo_exec_eval_update( struct vbo_exec_context *exec );
 
 void vbo_exec_do_EvalCoord2f( struct vbo_exec_context *exec, 
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index 789869a9790..c1f2146aad8 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -113,7 +113,8 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
  * Deal with buffer wrapping where provoked by the vertex buffer
  * filling up, as opposed to upgrade_vertex().
  */
-void vbo_exec_vtx_wrap( struct vbo_exec_context *exec )
+static void
+vbo_exec_vtx_wrap(struct vbo_exec_context *exec)
 {
    fi_type *data = exec->vtx.copied.buffer;
    GLuint i;

From 1637cec8f894f80937fe7c1b1f4fe4d245d6005b Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 21:21:56 -0600
Subject: [PATCH 33/85] vbo: replace the comment on vbo_copy_vertices()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_exec_draw.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 174cbc37c26..781991bd0bf 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -64,9 +64,13 @@ vbo_exec_debug_verts( struct vbo_exec_context *exec )
 }
 
 
-/*
- * NOTE: Need to have calculated primitives by this point -- do it on the fly.
- * NOTE: Old 'parity' issue is gone.
+/**
+ * Copy zero, one or two vertices from the current vertex buffer into
+ * the temporary "copy" buffer.
+ * This is used when a single primitive overflows a vertex buffer and
+ * we need to continue the primitive in a new vertex buffer.
+ * The temporary "copy" buffer holds the vertices which need to get
+ * copied from the old buffer to the new one.
  */
 static GLuint
 vbo_copy_vertices( struct vbo_exec_context *exec )

From d24c3a680e9282c11bd411d0c4dbcff561c0f4ca Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 20:18:32 -0600
Subject: [PATCH 34/85] vbo: simplify some code in vbo_exec_wrap_buffers()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use a new 'last_prim' pointer to simplify things.

v2: remove unneeded assert(exec->vtx.prim_count > 0)

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_exec_api.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index c1f2146aad8..f519f8a4e61 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -71,17 +71,15 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
       exec->vtx.buffer_ptr = exec->vtx.buffer_map;
    }
    else {
-      GLuint last_begin = exec->vtx.prim[exec->vtx.prim_count-1].begin;
+      struct _mesa_prim *last_prim = &exec->vtx.prim[exec->vtx.prim_count - 1];
+      const GLuint last_begin = last_prim->begin;
       GLuint last_count;
 
       if (_mesa_inside_begin_end(exec->ctx)) {
-	 GLint i = exec->vtx.prim_count - 1;
-	 assert(i >= 0);
-	 exec->vtx.prim[i].count = (exec->vtx.vert_count - 
-				    exec->vtx.prim[i].start);
+	 last_prim->count = exec->vtx.vert_count - last_prim->start;
       }
 
-      last_count = exec->vtx.prim[exec->vtx.prim_count-1].count;
+      last_count = last_prim->count;
 
       /* Execute the buffer and save copied vertices.
        */

From d916175c4d965942325bbb4a684fac45fb6ab9e2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 21:25:18 -0600
Subject: [PATCH 35/85] vbo: simplify some code in vbo_copy_vertices()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As before, use a new 'last_prim' pointer to simplify things.  Plus, add
some const qualifiers.

v2: use 'sz' in another place, per Sinclair.  And update subject line.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_exec_draw.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 781991bd0bf..9b1103dad72 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -75,13 +75,12 @@ vbo_exec_debug_verts( struct vbo_exec_context *exec )
 static GLuint
 vbo_copy_vertices( struct vbo_exec_context *exec )
 {
-   GLuint nr = exec->vtx.prim[exec->vtx.prim_count-1].count;
+   struct _mesa_prim *last_prim = &exec->vtx.prim[exec->vtx.prim_count - 1];
+   const GLuint nr = last_prim->count;
    GLuint ovf, i;
-   GLuint sz = exec->vtx.vertex_size;
+   const GLuint sz = exec->vtx.vertex_size;
    fi_type *dst = exec->vtx.copied.buffer;
-   const fi_type *src = (exec->vtx.buffer_map +
-                         exec->vtx.prim[exec->vtx.prim_count-1].start * 
-                         exec->vtx.vertex_size);
+   const fi_type *src = exec->vtx.buffer_map + last_prim->start * sz;
 
    switch (exec->ctx->Driver.CurrentExecPrimitive) {
    case GL_POINTS:
@@ -127,7 +126,7 @@ vbo_copy_vertices( struct vbo_exec_context *exec )
    case GL_TRIANGLE_STRIP:
       /* no parity issue, but need to make sure the tri is not drawn twice */
       if (nr & 1) {
-	 exec->vtx.prim[exec->vtx.prim_count-1].count--;
+	 last_prim->count--;
       }
       /* fallthrough */
    case GL_QUAD_STRIP:

From 002c5c1da3a2db60607fb184b9c6343415987fb5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 20:33:52 -0600
Subject: [PATCH 36/85] vbo: simplify some code in vbo_exec_End()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_exec_api.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index f519f8a4e61..3f87ac44265 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -821,11 +821,10 @@ static void GLAPIENTRY vbo_exec_End( void )
 
    if (exec->vtx.prim_count > 0) {
       /* close off current primitive */
-      int idx = exec->vtx.vert_count;
-      int i = exec->vtx.prim_count - 1;
+      struct _mesa_prim *last_prim = &exec->vtx.prim[exec->vtx.prim_count - 1];
 
-      exec->vtx.prim[i].end = 1;
-      exec->vtx.prim[i].count = idx - exec->vtx.prim[i].start;
+      last_prim->end = 1;
+      last_prim->count = exec->vtx.vert_count - last_prim->start;
 
       try_vbo_merge(exec);
    }

From 03d2f085394011f704f5702a92128b5677733c38 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 16 Oct 2015 11:19:40 -0600
Subject: [PATCH 37/85] vbo: add new vbo_compute_max_verts() helper function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_context.h   | 14 ++++++++++++++
 src/mesa/vbo/vbo_exec_api.c  |  3 +--
 src/mesa/vbo/vbo_exec_draw.c |  3 +--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index a376efe34a7..1e85335c107 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -196,6 +196,20 @@ vbo_get_default_vals_as_union(GLenum format)
    }
 }
 
+
+/**
+ * Compute the max number of vertices which can be stored in
+ * a vertex buffer, given the current vertex size, and the amount
+ * of space already used.
+ */
+static inline unsigned
+vbo_compute_max_verts(const struct vbo_exec_context *exec)
+{
+   return (VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
+          (exec->vtx.vertex_size * sizeof(GLfloat));
+}
+
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index 3f87ac44265..f26bf405d56 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -291,8 +291,7 @@ vbo_exec_wrap_upgrade_vertex(struct vbo_exec_context *exec,
     */
    exec->vtx.attrsz[attr] = newSize;
    exec->vtx.vertex_size += newSize - oldSize;
-   exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / 
-                         (exec->vtx.vertex_size * sizeof(GLfloat)));
+   exec->vtx.max_vert = vbo_compute_max_verts(exec);
    exec->vtx.vert_count = 0;
    exec->vtx.buffer_ptr = exec->vtx.buffer_map;
 
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 9b1103dad72..f6a1e4bdfad 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -435,8 +435,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
    if (keepUnmapped || exec->vtx.vertex_size == 0)
       exec->vtx.max_vert = 0;
    else
-      exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
-                            (exec->vtx.vertex_size * sizeof(GLfloat)));
+      exec->vtx.max_vert = vbo_compute_max_verts(exec);
 
    exec->vtx.buffer_ptr = exec->vtx.buffer_map;
    exec->vtx.prim_count = 0;

From d79595bf0230824b241545c0a0bd2294525df088 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 22:31:50 -0600
Subject: [PATCH 38/85] vbo: fix GL_LINE_LOOP stray line bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When long GL_LINE_LOOP primitives don't fit in one vertex buffer they
have to be split across buffers.  The code to do this was basically correct
but drivers had to pay special attention to the _mesa_prim::begin,end flags
in order to draw the sections of the line loop properly.  Apparently, the
only drivers to do this were those using the old 'tnl' module for software
vertex processing.

Now we convert the split pieces of GL_LINE_LOOP prims into GL_LINE_STRIP
primitives so that drivers don't have to worry about the special begin/end
flags.  The only time a driver will get a GL_LINE_LOOP prim is when the
whole thing fits in one vertex buffer.

Mostly fixes bug 81174, but not completely.  There's another bug somewhere
in the src/gallium/auxiliary/draw/ code.  If the piglit lineloop test is
run with -count 4096, rendering is correct, but with -count 4097 there are
stray lines.  4096 is a magic number in the draw code (search for "4096").

Also note that this does not fix long line loops in display lists.  The
next patch fixes that.

v2: fix incorrect -1 in vbo_compute_max_verts(), per Charmaine.  Remove
incorrect assertion which was added in vbo_copy_vertices().

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=81174
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=49779
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=28130

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_context.h   | 10 ++++++++--
 src/mesa/vbo/vbo_exec_api.c  | 38 +++++++++++++++++++++++++++++++++++-
 src/mesa/vbo/vbo_exec_draw.c | 11 +++++++++++
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index 1e85335c107..e6b9d890d5f 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -205,8 +205,14 @@ vbo_get_default_vals_as_union(GLenum format)
 static inline unsigned
 vbo_compute_max_verts(const struct vbo_exec_context *exec)
 {
-   return (VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
-          (exec->vtx.vertex_size * sizeof(GLfloat));
+   unsigned n = (VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
+      (exec->vtx.vertex_size * sizeof(GLfloat));
+   assert(n > 0);
+   /* Subtract one so we're always sure to have room for an extra
+    * vertex for GL_LINE_LOOP -> GL_LINE_STRIP conversion.
+    */
+   n--;
+   return n;
 }
 
 
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index f26bf405d56..a23d5aa08aa 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -61,7 +61,8 @@ static void reset_attrfv( struct vbo_exec_context *exec );
 
 /**
  * Close off the last primitive, execute the buffer, restart the
- * primitive.  
+ * primitive.  This is called when we fill a vertex buffer before
+ * hitting glEnd.
  */
 static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
 {
@@ -81,6 +82,22 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
 
       last_count = last_prim->count;
 
+      /* Special handling for wrapping GL_LINE_LOOP */
+      if (last_prim->mode == GL_LINE_LOOP &&
+          last_count > 0 &&
+          !last_prim->end) {
+         /* draw this section of the incomplete line loop as a line strip */
+         last_prim->mode = GL_LINE_STRIP;
+         if (!last_prim->begin) {
+            /* This is not the first section of the line loop, so don't
+             * draw the 0th vertex.  We're saving it until we draw the
+             * very last section of the loop.
+             */
+            last_prim->start++;
+            last_prim->count--;
+         }
+      }
+
       /* Execute the buffer and save copied vertices.
        */
       if (exec->vtx.vert_count)
@@ -96,6 +113,7 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
 
       if (_mesa_inside_begin_end(exec->ctx)) {
 	 exec->vtx.prim[0].mode = exec->ctx->Driver.CurrentExecPrimitive;
+	 exec->vtx.prim[0].begin = 0;
 	 exec->vtx.prim[0].start = 0;
 	 exec->vtx.prim[0].count = 0;
 	 exec->vtx.prim_count++;
@@ -825,6 +843,24 @@ static void GLAPIENTRY vbo_exec_End( void )
       last_prim->end = 1;
       last_prim->count = exec->vtx.vert_count - last_prim->start;
 
+      /* Special handling for GL_LINE_LOOP */
+      if (last_prim->mode == GL_LINE_LOOP && last_prim->begin == 0) {
+         /* We're finishing drawing a line loop.  Append 0th vertex onto
+          * end of vertex buffer so we can draw it as a line strip.
+          */
+         const fi_type *src = exec->vtx.buffer_map;
+         fi_type *dst = exec->vtx.buffer_map +
+            exec->vtx.vert_count * exec->vtx.vertex_size;
+
+         /* copy 0th vertex to end of buffer */
+         memcpy(dst, src, exec->vtx.vertex_size * sizeof(fi_type));
+
+         assert(last_prim->start == 0);
+         last_prim->start++;  /* skip vertex0 */
+         /* note that last_prim->count stays unchanged */
+         last_prim->mode = GL_LINE_STRIP;
+      }
+
       try_vbo_merge(exec);
    }
 
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index f6a1e4bdfad..ed5d9e947b0 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -109,6 +109,17 @@ vbo_copy_vertices( struct vbo_exec_context *exec )
 	 return 1;
       }
    case GL_LINE_LOOP:
+      if (last_prim->begin == 0) {
+         /* We're dealing with the second or later section of a split/wrapped
+          * GL_LINE_LOOP.  Since we're converting line loops to line strips,
+          * we've already incremented the last_prim->start counter by one to
+          * skip the 0th vertex in the loop.  We need to undo that (effectively
+          * subtract one from last_prim->start) so that we copy the 0th vertex
+          * to the next vertex buffer.
+          */
+         src -= sz;
+      }
+      /* fall-through */
    case GL_TRIANGLE_FAN:
    case GL_POLYGON:
       if (nr == 0) {

From f2215809377234aa2073502587e1803c8952bea3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 15 Oct 2015 22:57:08 -0600
Subject: [PATCH 39/85] vbo: convert display list GL_LINE_LOOP prims to
 GL_LINE_STRIP

When a long GL_LINE_LOOP prim was split across primitives we drew
stray lines.  See previous commit for details.

This patch converts GL_LINE_LOOP prims into GL_LINE_STRIP prims so
that drivers don't have to worry about the _mesa_prim::begin/end flags.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=81174

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Acked-by: Sinclair Yeh <syeh@vmware.com>
---
 src/mesa/vbo/vbo_save_api.c | 53 +++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index 6688ba0d797..d49aa15b1b7 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -360,6 +360,51 @@ merge_prims(struct _mesa_prim *prim_list,
    *prim_count = prev_prim - prim_list + 1;
 }
 
+
+/**
+ * Convert GL_LINE_LOOP primitive into GL_LINE_STRIP so that drivers
+ * don't have to worry about handling the _mesa_prim::begin/end flags.
+ * See https://bugs.freedesktop.org/show_bug.cgi?id=81174
+ */
+static void
+convert_line_loop_to_strip(struct vbo_save_context *save,
+                           struct vbo_save_vertex_list *node)
+{
+   struct _mesa_prim *prim = &node->prim[node->prim_count - 1];
+
+   assert(prim->mode == GL_LINE_LOOP);
+
+   if (prim->end) {
+      /* Copy the 0th vertex to end of the buffer and extend the
+       * vertex count by one to finish the line loop.
+       */
+      const GLuint sz = save->vertex_size;
+      /* 0th vertex: */
+      const fi_type *src = save->buffer + prim->start * sz;
+      /* end of buffer: */
+      fi_type *dst = save->buffer + (prim->start + prim->count) * sz;
+
+      memcpy(dst, src, sz * sizeof(float));
+
+      prim->count++;
+      node->count++;
+      save->vert_count++;
+      save->buffer_ptr += sz;
+      save->vertex_store->used += sz;
+   }
+
+   if (!prim->begin) {
+      /* Drawing the second or later section of a long line loop.
+       * Skip the 0th vertex.
+       */
+      prim->start++;
+      prim->count--;
+   }
+
+   prim->mode = GL_LINE_STRIP;
+}
+
+
 /**
  * Insert the active immediate struct onto the display list currently
  * being built.
@@ -441,6 +486,10 @@ _save_compile_vertex_list(struct gl_context *ctx)
     */
    save->copied.nr = _save_copy_vertices(ctx, node, save->buffer);
 
+   if (node->prim[node->prim_count - 1].mode == GL_LINE_LOOP) {
+      convert_line_loop_to_strip(save, node);
+   }
+
    merge_prims(node->prim, &node->prim_count);
 
    /* Deal with GL_COMPILE_AND_EXECUTE:
@@ -482,6 +531,10 @@ _save_compile_vertex_list(struct gl_context *ctx)
       save->buffer_ptr = vbo_save_map_vertex_store(ctx, save->vertex_store);
       save->out_of_memory = save->buffer_ptr == NULL;
    }
+   else {
+      /* update buffer_ptr for next vertex */
+      save->buffer_ptr = save->vertex_store->buffer + save->vertex_store->used;
+   }
 
    if (save->prim_store->used > VBO_SAVE_PRIM_SIZE - 6) {
       save->prim_store->refcount--;

From a5a00bd7472dc8876e1bec9a7172af7d332ac95e Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 23 Sep 2015 16:12:26 -0700
Subject: [PATCH 40/85] i965/gen9: Reuse YF alignment tables in
 tr_mode_..._texture_alignment()

Patch just does some refactoring to make the code look better. No
functional changes in here.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 46 +++++++++++-----------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 67628c96d20..2ce3f71424a 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -43,23 +43,19 @@ static unsigned int
 tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
                                      const struct intel_mipmap_tree *mt)
 {
-   const unsigned *align_yf, *align_ys;
+   const unsigned *align_yf;
    const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
-   unsigned ret_align, divisor;
+   unsigned ret_align, divisor, multiplier_ys;
 
-   /* Horizontal alignment tables for TRMODE_{YF,YS}. Value in below
-    * tables specifies the horizontal alignment requirement in elements
-    * for the surface. An element is defined as a pixel in uncompressed
-    * surface formats, and as a compression block in compressed surface
-    * formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
+   /* Values in the tables below specify the horizontal alignment requirement
+    * in elements for TRMODE_YF surface. An element is defined as a pixel in
+    * uncompressed surface formats, and as a compression block in compressed
+    * surface formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
     * element is a sample.
     */
    const unsigned align_1d_yf[] = {4096, 2048, 1024, 512, 256};
-   const unsigned align_1d_ys[] = {65536, 32768, 16384, 8192, 4096};
    const unsigned align_2d_yf[] = {64, 64, 32, 32, 16};
-   const unsigned align_2d_ys[] = {256, 256, 128, 128, 64};
    const unsigned align_3d_yf[] = {16, 8, 8, 8, 4};
-   const unsigned align_3d_ys[] = {64, 32, 32, 32, 16};
    int i = 0;
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
@@ -69,7 +65,7 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
       align_yf = align_1d_yf;
-      align_ys = align_1d_ys;
+      multiplier_ys = 16;
       break;
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE:
@@ -79,11 +75,11 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       align_yf = align_2d_yf;
-      align_ys = align_2d_ys;
+      multiplier_ys = 4;
       break;
    case GL_TEXTURE_3D:
       align_yf = align_3d_yf;
-      align_ys = align_3d_ys;
+      multiplier_ys = 4;
       break;
    default:
       unreachable("not reached");
@@ -92,8 +88,10 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    /* Compute array index. */
    i = ffs(bpp/8) - 1;
 
-   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
-               align_yf[i] : align_ys[i];
+   ret_align = align_yf[i];
+
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
+      ret_align *= multiplier_ys;
 
    assert(_mesa_is_pow_two(mt->num_samples));
 
@@ -151,15 +149,13 @@ static unsigned int
 tr_mode_vertical_texture_alignment(const struct brw_context *brw,
                                    const struct intel_mipmap_tree *mt)
 {
-   const unsigned *align_yf, *align_ys;
+   const unsigned *align_yf;
    const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
-   unsigned ret_align, divisor;
+   unsigned ret_align, divisor, multiplier_ys;
 
-   /* Vertical alignment tables for TRMODE_YF and TRMODE_YS. */
+   /* Vertical alignment tables for TRMODE_YF */
    const unsigned align_2d_yf[] = {64, 32, 32, 16, 16};
-   const unsigned align_2d_ys[] = {256, 128, 128, 64, 64};
    const unsigned align_3d_yf[] = {16, 16, 16, 8, 8};
-   const unsigned align_3d_ys[] = {32, 32, 32, 16, 16};
    int i = 0;
 
    assert(brw->gen >= 9);
@@ -176,11 +172,11 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       align_yf = align_2d_yf;
-      align_ys = align_2d_ys;
+      multiplier_ys = 4;
       break;
    case GL_TEXTURE_3D:
       align_yf = align_3d_yf;
-      align_ys = align_3d_ys;
+      multiplier_ys = 2;
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
@@ -191,8 +187,10 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
    /* Compute array index. */
    i = ffs(bpp / 8) - 1;
 
-   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
-               align_yf[i] : align_ys[i];
+   ret_align = align_yf[i];
+
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
+      ret_align *= multiplier_ys;
 
    assert(_mesa_is_pow_two(mt->num_samples));
 

From 8f8c450bc7c1652b8c76f7e716273f0f784d30c0 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 23 Sep 2015 16:13:00 -0700
Subject: [PATCH 41/85] i965/gen9: Remove parameter 'brw' from
 tr_mode_..._texture_alignment()

V2: Rebased on master.

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 2ce3f71424a..f1aeae9d12a 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -40,8 +40,7 @@
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
 static unsigned int
-tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
-                                     const struct intel_mipmap_tree *mt)
+tr_mode_horizontal_texture_alignment(const struct intel_mipmap_tree *mt)
 {
    const unsigned *align_yf;
    const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
@@ -58,6 +57,8 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    const unsigned align_3d_yf[] = {16, 8, 8, 8, 4};
    int i = 0;
 
+   assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
+
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
    assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
 
@@ -146,8 +147,7 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
 }
 
 static unsigned int
-tr_mode_vertical_texture_alignment(const struct brw_context *brw,
-                                   const struct intel_mipmap_tree *mt)
+tr_mode_vertical_texture_alignment(const struct intel_mipmap_tree *mt)
 {
    const unsigned *align_yf;
    const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
@@ -158,7 +158,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
    const unsigned align_3d_yf[] = {16, 16, 16, 8, 8};
    int i = 0;
 
-   assert(brw->gen >= 9);
+   assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
    assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ;
@@ -777,8 +777,8 @@ intel_miptree_set_alignment(struct brw_context *brw,
    } else if (brw->gen >= 9 && mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
       /* XY_FAST_COPY_BLT doesn't support horizontal alignment < 32 or
        * vertical alignment < 64. */
-      mt->halign = MAX2(tr_mode_horizontal_texture_alignment(brw, mt), 32);
-      mt->valign = MAX2(tr_mode_vertical_texture_alignment(brw, mt), 64);
+      mt->halign = MAX2(tr_mode_horizontal_texture_alignment(mt), 32);
+      mt->valign = MAX2(tr_mode_vertical_texture_alignment(mt), 64);
    } else {
       mt->halign =
          intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);

From 06ec19bca4a8d9c714769c658aeb401697ab6bba Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Wed, 12 Aug 2015 16:39:05 -0700
Subject: [PATCH 42/85] i965/gen9: Remove temporary variable 'align_yf' in
 tr_mode_..._texture_alignment()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 26 ++++++++--------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index f1aeae9d12a..c7e35410210 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -42,7 +42,6 @@
 static unsigned int
 tr_mode_horizontal_texture_alignment(const struct intel_mipmap_tree *mt)
 {
-   const unsigned *align_yf;
    const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
    unsigned ret_align, divisor, multiplier_ys;
 
@@ -61,11 +60,13 @@ tr_mode_horizontal_texture_alignment(const struct intel_mipmap_tree *mt)
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
    assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
+   /* Compute array index. */
+   i = ffs(bpp/8) - 1;
 
    switch(mt->target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
-      align_yf = align_1d_yf;
+      ret_align = align_1d_yf[i];
       multiplier_ys = 16;
       break;
    case GL_TEXTURE_2D:
@@ -75,22 +76,17 @@ tr_mode_horizontal_texture_alignment(const struct intel_mipmap_tree *mt)
    case GL_TEXTURE_CUBE_MAP_ARRAY:
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-      align_yf = align_2d_yf;
+      ret_align = align_2d_yf[i];
       multiplier_ys = 4;
       break;
    case GL_TEXTURE_3D:
-      align_yf = align_3d_yf;
+      ret_align = align_3d_yf[i];
       multiplier_ys = 4;
       break;
    default:
       unreachable("not reached");
    }
 
-   /* Compute array index. */
-   i = ffs(bpp/8) - 1;
-
-   ret_align = align_yf[i];
-
    if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
       ret_align *= multiplier_ys;
 
@@ -149,7 +145,6 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
 static unsigned int
 tr_mode_vertical_texture_alignment(const struct intel_mipmap_tree *mt)
 {
-   const unsigned *align_yf;
    const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
    unsigned ret_align, divisor, multiplier_ys;
 
@@ -162,6 +157,8 @@ tr_mode_vertical_texture_alignment(const struct intel_mipmap_tree *mt)
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
    assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ;
+   /* Compute array index. */
+   i = ffs(bpp / 8) - 1;
 
    switch(mt->target) {
    case GL_TEXTURE_2D:
@@ -171,11 +168,11 @@ tr_mode_vertical_texture_alignment(const struct intel_mipmap_tree *mt)
    case GL_TEXTURE_CUBE_MAP_ARRAY:
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-      align_yf = align_2d_yf;
+      ret_align = align_2d_yf[i];
       multiplier_ys = 4;
       break;
    case GL_TEXTURE_3D:
-      align_yf = align_3d_yf;
+      ret_align = align_3d_yf[i];
       multiplier_ys = 2;
       break;
    case GL_TEXTURE_1D:
@@ -184,11 +181,6 @@ tr_mode_vertical_texture_alignment(const struct intel_mipmap_tree *mt)
       unreachable("Unexpected miptree target");
    }
 
-   /* Compute array index. */
-   i = ffs(bpp / 8) - 1;
-
-   ret_align = align_yf[i];
-
    if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
       ret_align *= multiplier_ys;
 

From 876d07d8377bb61417ba2f443afa8b7a30b9de81 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Thu, 13 Aug 2015 11:19:47 -0700
Subject: [PATCH 43/85] i965/gen9: Remove temporary variable 'bpp' in
 tr_mode_..._texture_alignment()

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index c7e35410210..a2948293a62 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -42,7 +42,6 @@
 static unsigned int
 tr_mode_horizontal_texture_alignment(const struct intel_mipmap_tree *mt)
 {
-   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
    unsigned ret_align, divisor, multiplier_ys;
 
    /* Values in below tables specifiy the horizontal alignment requirement
@@ -54,14 +53,13 @@ tr_mode_horizontal_texture_alignment(const struct intel_mipmap_tree *mt)
    const unsigned align_1d_yf[] = {4096, 2048, 1024, 512, 256};
    const unsigned align_2d_yf[] = {64, 64, 32, 32, 16};
    const unsigned align_3d_yf[] = {16, 8, 8, 8, 4};
-   int i = 0;
 
    assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
 
-   /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
+   /* Alignment computations below assume a power of 2 cpp. */
+   assert (mt->cpp >= 1 && mt->cpp <= 16 && _mesa_is_pow_two(mt->cpp));
    /* Compute array index. */
-   i = ffs(bpp/8) - 1;
+   const int i = ffs(mt->cpp) - 1;
 
    switch(mt->target) {
    case GL_TEXTURE_1D:
@@ -145,20 +143,18 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
 static unsigned int
 tr_mode_vertical_texture_alignment(const struct intel_mipmap_tree *mt)
 {
-   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
    unsigned ret_align, divisor, multiplier_ys;
 
    /* Vertical alignment tables for TRMODE_YF */
    const unsigned align_2d_yf[] = {64, 32, 32, 16, 16};
    const unsigned align_3d_yf[] = {16, 16, 16, 8, 8};
-   int i = 0;
 
    assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
 
-   /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ;
+   /* Alignment computations below assume a power of 2 cpp. */
+   assert (mt->cpp >= 1 && mt->cpp <= 16 && _mesa_is_pow_two(mt->cpp)) ;
    /* Compute array index. */
-   i = ffs(bpp / 8) - 1;
+   const int i = ffs(mt->cpp) - 1;
 
    switch(mt->target) {
    case GL_TEXTURE_2D:

From b48e16fa2f8b96bb36a6e0a92b6d842c1c246006 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Sat, 17 Oct 2015 12:07:32 -0600
Subject: [PATCH 44/85] draw: fix splitting of line loops (v2)

When the draw module splits long line loops, the sections are emitted
as line strips.  But the primitive type wasn't set correctly so each
section was being drawn as a loop, introducing extra line segments.

To fix this, we pass a new DRAW_LINE_LOOP_AS_STRIP flag to the run()
function.  The linear/elt_run() functions have to check for this flag
and set their primitive type accordingly.

No piglit regressions.  Fixes piglit's lineloop with -count 4097 or
higher.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=81174

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/draw/draw_private.h        |  5 +++--
 .../draw/draw_pt_fetch_shade_pipeline.c          | 16 +++++++++++++---
 .../draw/draw_pt_fetch_shade_pipeline_llvm.c     | 16 +++++++++++++---
 src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h  |  3 +++
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 0ad94bb031f..5584c4a222c 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -355,8 +355,9 @@ struct draw_vertex_info {
 };
 
 /* these flags are set if the primitive is a segment of a larger one */
-#define DRAW_SPLIT_BEFORE 0x1
-#define DRAW_SPLIT_AFTER  0x2
+#define DRAW_SPLIT_BEFORE        0x1
+#define DRAW_SPLIT_AFTER         0x2
+#define DRAW_LINE_LOOP_AS_STRIP  0x4
 
 struct draw_prim_info {
    boolean linear;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index ffec863ae6f..aa20b918f50 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -359,6 +359,16 @@ fetch_pipeline_generic(struct draw_pt_middle_end *middle,
 }
 
 
+static inline unsigned
+prim_type(unsigned prim, unsigned flags)
+{
+   if (flags & DRAW_LINE_LOOP_AS_STRIP)
+      return PIPE_PRIM_LINE_STRIP;
+   else
+      return prim;
+}
+
+
 static void
 fetch_pipeline_run(struct draw_pt_middle_end *middle,
                    const unsigned *fetch_elts,
@@ -380,7 +390,7 @@ fetch_pipeline_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
@@ -408,7 +418,7 @@ fetch_pipeline_linear_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = count;
    prim_info.elts = NULL;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
@@ -439,7 +449,7 @@ fetch_pipeline_linear_run_elts(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index e42c4af0e70..2d7569b0fdf 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -473,6 +473,16 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle,
 }
 
 
+static inline unsigned
+prim_type(unsigned prim, unsigned flags)
+{
+   if (flags & DRAW_LINE_LOOP_AS_STRIP)
+      return PIPE_PRIM_LINE_STRIP;
+   else
+      return prim;
+}
+
+
 static void
 llvm_middle_end_run(struct draw_pt_middle_end *middle,
                     const unsigned *fetch_elts,
@@ -494,7 +504,7 @@ llvm_middle_end_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
@@ -522,7 +532,7 @@ llvm_middle_end_linear_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = count;
    prim_info.elts = NULL;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
@@ -552,7 +562,7 @@ llvm_middle_end_linear_run_elts(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
index 0afabb01398..6da79b9490b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -249,6 +249,9 @@ vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags,
 
    assert(icount + !!close_loop <= vsplit->segment_size);
 
+   /* need to draw the sections of the line loop as line strips */
+   flags |= DRAW_LINE_LOOP_AS_STRIP;
+
    if (close_loop) {
       for (nr = 0; nr < icount; nr++)
          vsplit->fetch_elts[nr] = istart + nr;

From f1682fdafa54bb2e710707c441ed652358e57502 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 20 Oct 2015 18:22:43 -0600
Subject: [PATCH 45/85] svga: add switch case for
 PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT

A third instance of this case was needed but was missed in the previous
commit. Return 32, as in the other two cases.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/drivers/svga/svga_screen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 17b042e7d95..f6fafca5c0b 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -605,6 +605,8 @@ vgpu10_get_shader_param(struct pipe_screen *screen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    default:
       debug_printf("Unexpected vgpu10 shader query %u\n", param);
       return 0;

From 96bbb3707f402149ae48bc3991febeed86c4fa21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 20 Oct 2015 08:32:15 +0300
Subject: [PATCH 46/85] glsl: skip buffer variables when filling
 UniformRemapTable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

UniformRemapTable is used only for remapping user-specified uniform
locations to the ones used internally by the driver. Shader storage
buffer variables should not use uniform locations.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/link_uniforms.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index fe00aa30d07..f7b87a1811a 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -1180,7 +1180,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 
    /* Reserve all the explicit locations of the active uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
-      if (uniforms[i].type->is_subroutine())
+      if (uniforms[i].type->is_subroutine() ||
+          uniforms[i].is_shader_storage)
          continue;
 
       if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC) {
@@ -1200,8 +1201,10 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    /* Reserve locations for rest of the uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
 
-      if (uniforms[i].type->is_subroutine())
+      if (uniforms[i].type->is_subroutine() ||
+          uniforms[i].is_shader_storage)
          continue;
+
       /* Built-in uniforms should not get any location. */
       if (uniforms[i].builtin)
          continue;

From 1f48ea1193e3659c3f94b5de31d9200c1d500e72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 20 Oct 2015 10:24:50 +0300
Subject: [PATCH 47/85] glsl: do not try to reserve explicit locations for
 buffer variables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Explicit locations are only used with uniform variables.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/linker.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 247052bcf4f..07ea0e0c7e5 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3114,8 +3114,8 @@ check_explicit_uniform_locations(struct gl_context *ctx,
 
       foreach_in_list(ir_instruction, node, sh->ir) {
          ir_variable *var = node->as_variable();
-         if (var && (var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage) &&
-             var->data.explicit_location) {
+         if (var && (var->data.mode == ir_var_uniform &&
+                     var->data.explicit_location)) {
             bool ret;
             if (var->type->is_subroutine())
                ret = reserve_subroutine_explicit_locations(prog, sh, var);

From a59c1adcc665b70ca5a8fbfebe3f0d6e05ad2778 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Tue, 20 Oct 2015 12:18:51 +0300
Subject: [PATCH 48/85] glsl: fix record type detection in explicit location
 assign
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Check current_var directly instead of using the passed in record_type.

This fixes the following failing CTS test:
	ES31-CTS.explicit_uniform_location.uniform-loc-types-structs

No Piglit regressions.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/link_uniforms.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index f7b87a1811a..6efde5c27f2 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -763,7 +763,7 @@ private:
       /* Assign explicit locations. */
       if (current_var->data.explicit_location) {
          /* Set sequential locations for struct fields. */
-         if (record_type != NULL) {
+         if (current_var->type->without_array()->is_record()) {
             const unsigned entries = MAX2(1, this->uniforms[id].array_elements);
             this->uniforms[id].remap_location =
                this->explicit_location + field_counter;

From fd01840c0bd3b22d058a65a17ad30e3b45813b60 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Wed, 5 Aug 2015 15:49:22 +1000
Subject: [PATCH 49/85] glsl: add AoA support to subroutines

process_parameters() is now called earlier because actual_parameters
must be processed before it can be passed to
match_subroutine_by_name() to get the subroutine variable. This lookup
has to happen inside the recursive function generate_array_index()
because we can't create the ir_dereference_array() until we have
reached the outermost array.

For the remainder of the array dimensions the type doesn't matter so we
can just use the existing _mesa_ast_array_index_to_hir() function to
process the ast.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/ast_function.cpp     | 43 +++++++++++++++++++++++++++++++----
 src/glsl/lower_subroutine.cpp |  2 +-
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index c5c5cae333b..e4e4a3fe148 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -610,6 +610,37 @@ match_subroutine_by_name(const char *name,
    return sig;
 }
 
+static ir_rvalue *
+generate_array_index(void *mem_ctx, exec_list *instructions,
+                     struct _mesa_glsl_parse_state *state, YYLTYPE loc,
+                     const ast_expression *array, ast_expression *idx,
+                     const char **function_name, exec_list *actual_parameters)
+{
+   if (array->oper == ast_array_index) {
+      /* This handles arrays of arrays */
+      ir_rvalue *outer_array = generate_array_index(mem_ctx, instructions,
+                                                    state, loc,
+                                                    array->subexpressions[0],
+                                                    array->subexpressions[1],
+                                                    function_name, actual_parameters);
+      ir_rvalue *outer_array_idx = idx->hir(instructions, state);
+
+      YYLTYPE index_loc = idx->get_location();
+      return _mesa_ast_array_index_to_hir(mem_ctx, state, outer_array,
+                                          outer_array_idx, loc,
+                                          index_loc);
+   } else {
+      ir_variable *sub_var = NULL;
+      *function_name = array->primary_expression.identifier;
+
+      match_subroutine_by_name(*function_name, actual_parameters,
+                               state, &sub_var);
+
+      ir_rvalue *outer_array_idx = idx->hir(instructions, state);
+      return new(mem_ctx) ir_dereference_array(sub_var, outer_array_idx);
+   }
+}
+
 static void
 print_function_prototypes(_mesa_glsl_parse_state *state, YYLTYPE *loc,
                           ir_function *f)
@@ -1989,16 +2020,18 @@ ast_function_expression::hir(exec_list *instructions,
       ir_variable *sub_var = NULL;
       ir_rvalue *array_idx = NULL;
 
+      process_parameters(instructions, &actual_parameters, &this->expressions,
+			 state);
+
       if (id->oper == ast_array_index) {
-         func_name = id->subexpressions[0]->primary_expression.identifier;
-	 array_idx = id->subexpressions[1]->hir(instructions, state);
+         array_idx = generate_array_index(ctx, instructions, state, loc,
+                                          id->subexpressions[0],
+                                          id->subexpressions[1], &func_name,
+                                          &actual_parameters);
       } else {
          func_name = id->primary_expression.identifier;
       }
 
-      process_parameters(instructions, &actual_parameters, &this->expressions,
-			 state);
-
       ir_function_signature *sig =
 	 match_function_by_name(func_name, &actual_parameters, state);
 
diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp
index c1aed61a36a..a0df5e1df81 100644
--- a/src/glsl/lower_subroutine.cpp
+++ b/src/glsl/lower_subroutine.cpp
@@ -84,7 +84,7 @@ lower_subroutine_visitor::visit_leave(ir_call *ir)
          continue;
 
       if (ir->array_idx != NULL)
-         var = new(mem_ctx) ir_dereference_array(ir->sub_var, ir->array_idx->clone(mem_ctx, NULL));
+         var = ir->array_idx->clone(mem_ctx, NULL);
       else
          var = new(mem_ctx) ir_dereference_variable(ir->sub_var);
 

From 156b7d3113757eb437dfcfa3ca7ef9b03f3097b2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 20 Oct 2015 19:51:56 -0700
Subject: [PATCH 50/85] glsl: Fix bad indentation in bit_logic_result_type().

The first level of indentation was using 4 spaces.  Mesa uses 3.

Trivial.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/ast_to_hir.cpp | 86 ++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index db9229f6ae3..8549d55c4e3 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -487,54 +487,54 @@ bit_logic_result_type(const struct glsl_type *type_a,
                       ast_operators op,
                       struct _mesa_glsl_parse_state *state, YYLTYPE *loc)
 {
-    if (!state->check_bitwise_operations_allowed(loc)) {
-       return glsl_type::error_type;
-    }
+   if (!state->check_bitwise_operations_allowed(loc)) {
+      return glsl_type::error_type;
+   }
 
-    /* From page 50 (page 56 of PDF) of GLSL 1.30 spec:
-     *
-     *     "The bitwise operators and (&), exclusive-or (^), and inclusive-or
-     *     (|). The operands must be of type signed or unsigned integers or
-     *     integer vectors."
-     */
-    if (!type_a->is_integer()) {
-       _mesa_glsl_error(loc, state, "LHS of `%s' must be an integer",
-                         ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
-    if (!type_b->is_integer()) {
-       _mesa_glsl_error(loc, state, "RHS of `%s' must be an integer",
+   /* From page 50 (page 56 of PDF) of GLSL 1.30 spec:
+    *
+    *     "The bitwise operators and (&), exclusive-or (^), and inclusive-or
+    *     (|). The operands must be of type signed or unsigned integers or
+    *     integer vectors."
+    */
+   if (!type_a->is_integer()) {
+      _mesa_glsl_error(loc, state, "LHS of `%s' must be an integer",
                         ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
+      return glsl_type::error_type;
+   }
+   if (!type_b->is_integer()) {
+      _mesa_glsl_error(loc, state, "RHS of `%s' must be an integer",
+                       ast_expression::operator_string(op));
+      return glsl_type::error_type;
+   }
 
-    /*     "The fundamental types of the operands (signed or unsigned) must
-     *     match,"
-     */
-    if (type_a->base_type != type_b->base_type) {
-       _mesa_glsl_error(loc, state, "operands of `%s' must have the same "
-                        "base type", ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
+   /*     "The fundamental types of the operands (signed or unsigned) must
+    *     match,"
+    */
+   if (type_a->base_type != type_b->base_type) {
+      _mesa_glsl_error(loc, state, "operands of `%s' must have the same "
+                       "base type", ast_expression::operator_string(op));
+      return glsl_type::error_type;
+   }
 
-    /*     "The operands cannot be vectors of differing size." */
-    if (type_a->is_vector() &&
-        type_b->is_vector() &&
-        type_a->vector_elements != type_b->vector_elements) {
-       _mesa_glsl_error(loc, state, "operands of `%s' cannot be vectors of "
-                        "different sizes", ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
+   /*     "The operands cannot be vectors of differing size." */
+   if (type_a->is_vector() &&
+       type_b->is_vector() &&
+       type_a->vector_elements != type_b->vector_elements) {
+      _mesa_glsl_error(loc, state, "operands of `%s' cannot be vectors of "
+                       "different sizes", ast_expression::operator_string(op));
+      return glsl_type::error_type;
+   }
 
-    /*     "If one operand is a scalar and the other a vector, the scalar is
-     *     applied component-wise to the vector, resulting in the same type as
-     *     the vector. The fundamental types of the operands [...] will be the
-     *     resulting fundamental type."
-     */
-    if (type_a->is_scalar())
-        return type_b;
-    else
-        return type_a;
+   /*     "If one operand is a scalar and the other a vector, the scalar is
+    *     applied component-wise to the vector, resulting in the same type as
+    *     the vector. The fundamental types of the operands [...] will be the
+    *     resulting fundamental type."
+    */
+   if (type_a->is_scalar())
+       return type_b;
+   else
+       return type_a;
 }
 
 static const struct glsl_type *

From 9a04057ef130e1539aa94babd2e35ce53e6f1e1e Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 21 Oct 2015 13:37:11 +1100
Subject: [PATCH 51/85] glsl: add is_array_of_arrays() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As suggested by Ian Romanick

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/glsl/nir/glsl_types.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/glsl/nir/glsl_types.h b/src/glsl/nir/glsl_types.h
index 3ec764219de..52ca8260da7 100644
--- a/src/glsl/nir/glsl_types.h
+++ b/src/glsl/nir/glsl_types.h
@@ -513,6 +513,11 @@ struct glsl_type {
       return base_type == GLSL_TYPE_ARRAY;
    }
 
+   bool is_array_of_arrays() const
+   {
+      return is_array() && fields.array->is_array();
+   }
+
    /**
     * Query whether or not a type is a record
     */

From 38ceeeadaa2f5f0a21dba9f5339fbc4cba66dece Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 21 Oct 2015 13:44:10 +1100
Subject: [PATCH 52/85] glsl: check for arrays of arrays when assigning
 explicit locations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes assigning explicit locations in the CTS test:

ES31-CTS.explicit_uniform_location.uniform-loc-arrays-of-arrays

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/glsl/link_uniforms.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 6efde5c27f2..8183e65d2f5 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -763,7 +763,8 @@ private:
       /* Assign explicit locations. */
       if (current_var->data.explicit_location) {
          /* Set sequential locations for struct fields. */
-         if (current_var->type->without_array()->is_record()) {
+         if (current_var->type->without_array()->is_record() ||
+             current_var->type->is_array_of_arrays()) {
             const unsigned entries = MAX2(1, this->uniforms[id].array_elements);
             this->uniforms[id].remap_location =
                this->explicit_location + field_counter;

From 801f151917fedb13c5c6e96281a18d833dd6901f Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Tue, 20 Oct 2015 11:16:00 +0200
Subject: [PATCH 53/85] i965: Remove block arg from
 foreach_inst_in_block_*_starting_from

Since 49374fab5d793 these macros no longer actually use the block
argument. I think this is worth doing to make the macros easier to use
because they already have really long names and a confusing set of
arguments.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_cfg.h                       | 4 ++--
 src/mesa/drivers/dri/i965/brw_fs.cpp                      | 6 +++---
 src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp     | 3 +--
 src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp | 2 +-
 src/mesa/drivers/dri/i965/brw_vec4.cpp                    | 2 +-
 5 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index a09491781e6..a06b0aa1cd0 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -327,12 +327,12 @@ struct cfg_t {
 #define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \
    foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions)
 
-#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst, __block) \
+#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \
    for (__type *__scan_inst = (__type *)__inst->next;          \
         !__scan_inst->is_tail_sentinel();                      \
         __scan_inst = (__type *)__scan_inst->next)
 
-#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst, __block) \
+#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \
    for (__type *__scan_inst = (__type *)__inst->prev;          \
         !__scan_inst->is_head_sentinel();                      \
         __scan_inst = (__type *)__scan_inst->prev)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 49323eb790d..97d7fd76f62 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2456,7 +2456,7 @@ fs_visitor::compute_to_mrf()
       /* Found a move of a GRF to a MRF.  Let's see if we can go
        * rewrite the thing that made this GRF to write into the MRF.
        */
-      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
 	 if (scan_inst->dst.file == GRF &&
 	     scan_inst->dst.reg == inst->src[0].reg) {
 	    /* Found the last thing to write our reg we want to turn
@@ -2789,7 +2789,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
     * we assume that there are no outstanding dependencies on entry to the
     * program.
     */
-   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
       /* If we hit control flow, assume that there *are* outstanding
        * dependencies, and force their cleanup before our instruction.
        */
@@ -2855,7 +2855,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
    /* Walk forwards looking for writes to registers we're writing which aren't
     * read before being written.
     */
-   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
+   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
       /* If we hit control flow, force resolve all remaining dependencies. */
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
index 469f2ea4e16..883e8d2a49f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
@@ -87,8 +87,7 @@ opt_cmod_propagation_local(bblock_t *block)
          continue;
 
       bool read_flag = false;
-      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst,
-                                                  block) {
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
          if (scan_inst->overwrites_reg(inst->src[0])) {
             if (scan_inst->is_partial_write() ||
                 scan_inst->dst.reg_offset != inst->src[0].reg_offset)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
index 8792a8c7b1d..862e3245d43 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
@@ -64,7 +64,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
       int src_end_ip = v->live_intervals->end[src_var];
 
       bool interfered = false;
-      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
          if (scan_inst->overwrites_reg(inst->src[0])) {
             if (scan_inst->is_partial_write() ||
                 (scan_inst->dst.type != inst->dst.type &&
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index befc92445d3..3e7078d0b32 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1111,7 +1111,7 @@ vec4_visitor::opt_register_coalesce()
        */
       vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
       foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
-                                                  inst, block) {
+                                                  inst) {
          _scan_inst = scan_inst;
 
          if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {

From ee77796a5c97105bf7e92e3a7931ee0f331a0545 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Tue, 20 Oct 2015 11:56:15 +0200
Subject: [PATCH 54/85] i965/fs: Disable opt_sampler_eot for more message types

In bfdae9149e0 I disabled the opt_sampler_eot optimisation for TG4
message types because I found by experimentation that it doesn't work.
I wrote in the comment that I couldn't find any documentation for this
problem. However, I've now found the documentation, and it applies the
same restriction to further message types, so this patch updates the
comment and adds the other message types to the check.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 97d7fd76f62..da90467e625 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2238,13 +2238,15 @@ fs_visitor::opt_sampler_eot()
    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
       return false;
 
-   /* This optimisation doesn't seem to work for textureGather for some
-    * reason. I can't find any documentation or known workarounds to indicate
-    * that this is expected, but considering that it is probably pretty
-    * unlikely that a shader would directly write out the results from
-    * textureGather we might as well just disable it.
+   /* 3D Sampler » Messages » Message Format
+    *
+    * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
+    *  messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
     */
-   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
+   if (tex_inst->opcode == SHADER_OPCODE_TXS ||
+       tex_inst->opcode == SHADER_OPCODE_SAMPLEINFO ||
+       tex_inst->opcode == SHADER_OPCODE_LOD ||
+       tex_inst->opcode == SHADER_OPCODE_TG4 ||
        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
       return false;
 

From 99c4079c37ac04a0dad4ead3117f786706c80aaf Mon Sep 17 00:00:00 2001
From: Jonathan Gray <jsg@jsg.id.au>
Date: Sat, 10 Oct 2015 17:42:40 +1100
Subject: [PATCH 55/85] configure.ac: ensure RM is set

GNU make predefines RM to "rm -f", but this is not required by POSIX,
so ensure that RM is set.  This fixes "make clean" on OpenBSD.

v2: use AC_CHECK_PROG

Signed-off-by: Jonathan Gray <jsg@jsg.id.au>
CC: "10.6 11.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 configure.ac | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/configure.ac b/configure.ac
index 0a3329021c0..d3df1955d26 100644
--- a/configure.ac
+++ b/configure.ac
@@ -107,6 +107,8 @@ AC_SYS_LARGEFILE
 LT_PREREQ([2.2])
 LT_INIT([disable-static])
 
+AC_CHECK_PROG(RM, rm, [rm -f])
+
 AX_PROG_BISON([],
               AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.c"],
                     [AC_MSG_ERROR([bison not found - unable to compile glcpp-parse.y])]))

From 04703762e544bc732f6f8b07033221dfbd58159f Mon Sep 17 00:00:00 2001
From: Nigel Stewart <nigels@users.sourceforge.net>
Date: Mon, 12 Oct 2015 21:26:37 +1000
Subject: [PATCH 56/85] osmesa: Expose GL entry points for Windows build via
 DEF file.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92437
CC: "10.6 11.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/targets/osmesa/osmesa.def       | 337 ++++++++++++++++++++
 src/gallium/targets/osmesa/osmesa.mingw.def | 337 ++++++++++++++++++++
 2 files changed, 674 insertions(+)

diff --git a/src/gallium/targets/osmesa/osmesa.def b/src/gallium/targets/osmesa/osmesa.def
index e2a31ab5457..e347463de9f 100644
--- a/src/gallium/targets/osmesa/osmesa.def
+++ b/src/gallium/targets/osmesa/osmesa.def
@@ -14,3 +14,340 @@ EXPORTS
 	OSMesaGetProcAddress
 	OSMesaColorClamp
 	OSMesaPostprocess
+	glAccum
+	glAlphaFunc
+	glAreTexturesResident
+	glArrayElement
+	glBegin
+	glBindTexture
+	glBitmap
+	glBlendFunc
+	glCallList
+	glCallLists
+	glClear
+	glClearAccum
+	glClearColor
+	glClearDepth
+	glClearIndex
+	glClearStencil
+	glClipPlane
+	glColor3b
+	glColor3bv
+	glColor3d
+	glColor3dv
+	glColor3f
+	glColor3fv
+	glColor3i
+	glColor3iv
+	glColor3s
+	glColor3sv
+	glColor3ub
+	glColor3ubv
+	glColor3ui
+	glColor3uiv
+	glColor3us
+	glColor3usv
+	glColor4b
+	glColor4bv
+	glColor4d
+	glColor4dv
+	glColor4f
+	glColor4fv
+	glColor4i
+	glColor4iv
+	glColor4s
+	glColor4sv
+	glColor4ub
+	glColor4ubv
+	glColor4ui
+	glColor4uiv
+	glColor4us
+	glColor4usv
+	glColorMask
+	glColorMaterial
+	glColorPointer
+	glCopyPixels
+	glCopyTexImage1D
+	glCopyTexImage2D
+	glCopyTexSubImage1D
+	glCopyTexSubImage2D
+	glCullFace
+;	glDebugEntry
+	glDeleteLists
+	glDeleteTextures
+	glDepthFunc
+	glDepthMask
+	glDepthRange
+	glDisable
+	glDisableClientState
+	glDrawArrays
+	glDrawBuffer
+	glDrawElements
+	glDrawPixels
+	glEdgeFlag
+	glEdgeFlagPointer
+	glEdgeFlagv
+	glEnable
+	glEnableClientState
+	glEnd
+	glEndList
+	glEvalCoord1d
+	glEvalCoord1dv
+	glEvalCoord1f
+	glEvalCoord1fv
+	glEvalCoord2d
+	glEvalCoord2dv
+	glEvalCoord2f
+	glEvalCoord2fv
+	glEvalMesh1
+	glEvalMesh2
+	glEvalPoint1
+	glEvalPoint2
+	glFeedbackBuffer
+	glFinish
+	glFlush
+	glFogf
+	glFogfv
+	glFogi
+	glFogiv
+	glFrontFace
+	glFrustum
+	glGenLists
+	glGenTextures
+	glGetBooleanv
+	glGetClipPlane
+	glGetDoublev
+	glGetError
+	glGetFloatv
+	glGetIntegerv
+	glGetLightfv
+	glGetLightiv
+	glGetMapdv
+	glGetMapfv
+	glGetMapiv
+	glGetMaterialfv
+	glGetMaterialiv
+	glGetPixelMapfv
+	glGetPixelMapuiv
+	glGetPixelMapusv
+	glGetPointerv
+	glGetPolygonStipple
+	glGetString
+	glGetTexEnvfv
+	glGetTexEnviv
+	glGetTexGendv
+	glGetTexGenfv
+	glGetTexGeniv
+	glGetTexImage
+	glGetTexLevelParameterfv
+	glGetTexLevelParameteriv
+	glGetTexParameterfv
+	glGetTexParameteriv
+	glHint
+	glIndexMask
+	glIndexPointer
+	glIndexd
+	glIndexdv
+	glIndexf
+	glIndexfv
+	glIndexi
+	glIndexiv
+	glIndexs
+	glIndexsv
+	glIndexub
+	glIndexubv
+	glInitNames
+	glInterleavedArrays
+	glIsEnabled
+	glIsList
+	glIsTexture
+	glLightModelf
+	glLightModelfv
+	glLightModeli
+	glLightModeliv
+	glLightf
+	glLightfv
+	glLighti
+	glLightiv
+	glLineStipple
+	glLineWidth
+	glListBase
+	glLoadIdentity
+	glLoadMatrixd
+	glLoadMatrixf
+	glLoadName
+	glLogicOp
+	glMap1d
+	glMap1f
+	glMap2d
+	glMap2f
+	glMapGrid1d
+	glMapGrid1f
+	glMapGrid2d
+	glMapGrid2f
+	glMaterialf
+	glMaterialfv
+	glMateriali
+	glMaterialiv
+	glMatrixMode
+	glMultMatrixd
+	glMultMatrixf
+	glNewList
+	glNormal3b
+	glNormal3bv
+	glNormal3d
+	glNormal3dv
+	glNormal3f
+	glNormal3fv
+	glNormal3i
+	glNormal3iv
+	glNormal3s
+	glNormal3sv
+	glNormalPointer
+	glOrtho
+	glPassThrough
+	glPixelMapfv
+	glPixelMapuiv
+	glPixelMapusv
+	glPixelStoref
+	glPixelStorei
+	glPixelTransferf
+	glPixelTransferi
+	glPixelZoom
+	glPointSize
+	glPolygonMode
+	glPolygonOffset
+	glPolygonStipple
+	glPopAttrib
+	glPopClientAttrib
+	glPopMatrix
+	glPopName
+	glPrioritizeTextures
+	glPushAttrib
+	glPushClientAttrib
+	glPushMatrix
+	glPushName
+	glRasterPos2d
+	glRasterPos2dv
+	glRasterPos2f
+	glRasterPos2fv
+	glRasterPos2i
+	glRasterPos2iv
+	glRasterPos2s
+	glRasterPos2sv
+	glRasterPos3d
+	glRasterPos3dv
+	glRasterPos3f
+	glRasterPos3fv
+	glRasterPos3i
+	glRasterPos3iv
+	glRasterPos3s
+	glRasterPos3sv
+	glRasterPos4d
+	glRasterPos4dv
+	glRasterPos4f
+	glRasterPos4fv
+	glRasterPos4i
+	glRasterPos4iv
+	glRasterPos4s
+	glRasterPos4sv
+	glReadBuffer
+	glReadPixels
+	glRectd
+	glRectdv
+	glRectf
+	glRectfv
+	glRecti
+	glRectiv
+	glRects
+	glRectsv
+	glRenderMode
+	glRotated
+	glRotatef
+	glScaled
+	glScalef
+	glScissor
+	glSelectBuffer
+	glShadeModel
+	glStencilFunc
+	glStencilMask
+	glStencilOp
+	glTexCoord1d
+	glTexCoord1dv
+	glTexCoord1f
+	glTexCoord1fv
+	glTexCoord1i
+	glTexCoord1iv
+	glTexCoord1s
+	glTexCoord1sv
+	glTexCoord2d
+	glTexCoord2dv
+	glTexCoord2f
+	glTexCoord2fv
+	glTexCoord2i
+	glTexCoord2iv
+	glTexCoord2s
+	glTexCoord2sv
+	glTexCoord3d
+	glTexCoord3dv
+	glTexCoord3f
+	glTexCoord3fv
+	glTexCoord3i
+	glTexCoord3iv
+	glTexCoord3s
+	glTexCoord3sv
+	glTexCoord4d
+	glTexCoord4dv
+	glTexCoord4f
+	glTexCoord4fv
+	glTexCoord4i
+	glTexCoord4iv
+	glTexCoord4s
+	glTexCoord4sv
+	glTexCoordPointer
+	glTexEnvf
+	glTexEnvfv
+	glTexEnvi
+	glTexEnviv
+	glTexGend
+	glTexGendv
+	glTexGenf
+	glTexGenfv
+	glTexGeni
+	glTexGeniv
+	glTexImage1D
+	glTexImage2D
+	glTexParameterf
+	glTexParameterfv
+	glTexParameteri
+	glTexParameteriv
+	glTexSubImage1D
+	glTexSubImage2D
+	glTranslated
+	glTranslatef
+	glVertex2d
+	glVertex2dv
+	glVertex2f
+	glVertex2fv
+	glVertex2i
+	glVertex2iv
+	glVertex2s
+	glVertex2sv
+	glVertex3d
+	glVertex3dv
+	glVertex3f
+	glVertex3fv
+	glVertex3i
+	glVertex3iv
+	glVertex3s
+	glVertex3sv
+	glVertex4d
+	glVertex4dv
+	glVertex4f
+	glVertex4fv
+	glVertex4i
+	glVertex4iv
+	glVertex4s
+	glVertex4sv
+	glVertexPointer
+	glViewport
diff --git a/src/gallium/targets/osmesa/osmesa.mingw.def b/src/gallium/targets/osmesa/osmesa.mingw.def
index 874ac544084..945201c9d83 100644
--- a/src/gallium/targets/osmesa/osmesa.mingw.def
+++ b/src/gallium/targets/osmesa/osmesa.mingw.def
@@ -11,3 +11,340 @@ EXPORTS
 	OSMesaGetProcAddress = OSMesaGetProcAddress@4
 	OSMesaColorClamp = OSMesaColorClamp@4
 	OSMesaPostprocess = OSMesaPostprocess@12
+	glAccum = glAccum@8
+	glAlphaFunc = glAlphaFunc@8
+	glAreTexturesResident = glAreTexturesResident@12
+	glArrayElement = glArrayElement@4
+	glBegin = glBegin@4
+	glBindTexture = glBindTexture@8
+	glBitmap = glBitmap@28
+	glBlendFunc = glBlendFunc@8
+	glCallList = glCallList@4
+	glCallLists = glCallLists@12
+	glClear = glClear@4
+	glClearAccum = glClearAccum@16
+	glClearColor = glClearColor@16
+	glClearDepth = glClearDepth@8
+	glClearIndex = glClearIndex@4
+	glClearStencil = glClearStencil@4
+	glClipPlane = glClipPlane@8
+	glColor3b = glColor3b@12
+	glColor3bv = glColor3bv@4
+	glColor3d = glColor3d@24
+	glColor3dv = glColor3dv@4
+	glColor3f = glColor3f@12
+	glColor3fv = glColor3fv@4
+	glColor3i = glColor3i@12
+	glColor3iv = glColor3iv@4
+	glColor3s = glColor3s@12
+	glColor3sv = glColor3sv@4
+	glColor3ub = glColor3ub@12
+	glColor3ubv = glColor3ubv@4
+	glColor3ui = glColor3ui@12
+	glColor3uiv = glColor3uiv@4
+	glColor3us = glColor3us@12
+	glColor3usv = glColor3usv@4
+	glColor4b = glColor4b@16
+	glColor4bv = glColor4bv@4
+	glColor4d = glColor4d@32
+	glColor4dv = glColor4dv@4
+	glColor4f = glColor4f@16
+	glColor4fv = glColor4fv@4
+	glColor4i = glColor4i@16
+	glColor4iv = glColor4iv@4
+	glColor4s = glColor4s@16
+	glColor4sv = glColor4sv@4
+	glColor4ub = glColor4ub@16
+	glColor4ubv = glColor4ubv@4
+	glColor4ui = glColor4ui@16
+	glColor4uiv = glColor4uiv@4
+	glColor4us = glColor4us@16
+	glColor4usv = glColor4usv@4
+	glColorMask = glColorMask@16
+	glColorMaterial = glColorMaterial@8
+	glColorPointer = glColorPointer@16
+	glCopyPixels = glCopyPixels@20
+	glCopyTexImage1D = glCopyTexImage1D@28
+	glCopyTexImage2D = glCopyTexImage2D@32
+	glCopyTexSubImage1D = glCopyTexSubImage1D@24
+	glCopyTexSubImage2D = glCopyTexSubImage2D@32
+	glCullFace = glCullFace@4
+;	glDebugEntry = glDebugEntry@8
+	glDeleteLists = glDeleteLists@8
+	glDeleteTextures = glDeleteTextures@8
+	glDepthFunc = glDepthFunc@4
+	glDepthMask = glDepthMask@4
+	glDepthRange = glDepthRange@16
+	glDisable = glDisable@4
+	glDisableClientState = glDisableClientState@4
+	glDrawArrays = glDrawArrays@12
+	glDrawBuffer = glDrawBuffer@4
+	glDrawElements = glDrawElements@16
+	glDrawPixels = glDrawPixels@20
+	glEdgeFlag = glEdgeFlag@4
+	glEdgeFlagPointer = glEdgeFlagPointer@8
+	glEdgeFlagv = glEdgeFlagv@4
+	glEnable = glEnable@4
+	glEnableClientState = glEnableClientState@4
+	glEnd = glEnd@0
+	glEndList = glEndList@0
+	glEvalCoord1d = glEvalCoord1d@8
+	glEvalCoord1dv = glEvalCoord1dv@4
+	glEvalCoord1f = glEvalCoord1f@4
+	glEvalCoord1fv = glEvalCoord1fv@4
+	glEvalCoord2d = glEvalCoord2d@16
+	glEvalCoord2dv = glEvalCoord2dv@4
+	glEvalCoord2f = glEvalCoord2f@8
+	glEvalCoord2fv = glEvalCoord2fv@4
+	glEvalMesh1 = glEvalMesh1@12
+	glEvalMesh2 = glEvalMesh2@20
+	glEvalPoint1 = glEvalPoint1@4
+	glEvalPoint2 = glEvalPoint2@8
+	glFeedbackBuffer = glFeedbackBuffer@12
+	glFinish = glFinish@0
+	glFlush = glFlush@0
+	glFogf = glFogf@8
+	glFogfv = glFogfv@8
+	glFogi = glFogi@8
+	glFogiv = glFogiv@8
+	glFrontFace = glFrontFace@4
+	glFrustum = glFrustum@48
+	glGenLists = glGenLists@4
+	glGenTextures = glGenTextures@8
+	glGetBooleanv = glGetBooleanv@8
+	glGetClipPlane = glGetClipPlane@8
+	glGetDoublev = glGetDoublev@8
+	glGetError = glGetError@0
+	glGetFloatv = glGetFloatv@8
+	glGetIntegerv = glGetIntegerv@8
+	glGetLightfv = glGetLightfv@12
+	glGetLightiv = glGetLightiv@12
+	glGetMapdv = glGetMapdv@12
+	glGetMapfv = glGetMapfv@12
+	glGetMapiv = glGetMapiv@12
+	glGetMaterialfv = glGetMaterialfv@12
+	glGetMaterialiv = glGetMaterialiv@12
+	glGetPixelMapfv = glGetPixelMapfv@8
+	glGetPixelMapuiv = glGetPixelMapuiv@8
+	glGetPixelMapusv = glGetPixelMapusv@8
+	glGetPointerv = glGetPointerv@8
+	glGetPolygonStipple = glGetPolygonStipple@4
+	glGetString = glGetString@4
+	glGetTexEnvfv = glGetTexEnvfv@12
+	glGetTexEnviv = glGetTexEnviv@12
+	glGetTexGendv = glGetTexGendv@12
+	glGetTexGenfv = glGetTexGenfv@12
+	glGetTexGeniv = glGetTexGeniv@12
+	glGetTexImage = glGetTexImage@20
+	glGetTexLevelParameterfv = glGetTexLevelParameterfv@16
+	glGetTexLevelParameteriv = glGetTexLevelParameteriv@16
+	glGetTexParameterfv = glGetTexParameterfv@12
+	glGetTexParameteriv = glGetTexParameteriv@12
+	glHint = glHint@8
+	glIndexMask = glIndexMask@4
+	glIndexPointer = glIndexPointer@12
+	glIndexd = glIndexd@8
+	glIndexdv = glIndexdv@4
+	glIndexf = glIndexf@4
+	glIndexfv = glIndexfv@4
+	glIndexi = glIndexi@4
+	glIndexiv = glIndexiv@4
+	glIndexs = glIndexs@4
+	glIndexsv = glIndexsv@4
+	glIndexub = glIndexub@4
+	glIndexubv = glIndexubv@4
+	glInitNames = glInitNames@0
+	glInterleavedArrays = glInterleavedArrays@12
+	glIsEnabled = glIsEnabled@4
+	glIsList = glIsList@4
+	glIsTexture = glIsTexture@4
+	glLightModelf = glLightModelf@8
+	glLightModelfv = glLightModelfv@8
+	glLightModeli = glLightModeli@8
+	glLightModeliv = glLightModeliv@8
+	glLightf = glLightf@12
+	glLightfv = glLightfv@12
+	glLighti = glLighti@12
+	glLightiv = glLightiv@12
+	glLineStipple = glLineStipple@8
+	glLineWidth = glLineWidth@4
+	glListBase = glListBase@4
+	glLoadIdentity = glLoadIdentity@0
+	glLoadMatrixd = glLoadMatrixd@4
+	glLoadMatrixf = glLoadMatrixf@4
+	glLoadName = glLoadName@4
+	glLogicOp = glLogicOp@4
+	glMap1d = glMap1d@32
+	glMap1f = glMap1f@24
+	glMap2d = glMap2d@56
+	glMap2f = glMap2f@40
+	glMapGrid1d = glMapGrid1d@20
+	glMapGrid1f = glMapGrid1f@12
+	glMapGrid2d = glMapGrid2d@40
+	glMapGrid2f = glMapGrid2f@24
+	glMaterialf = glMaterialf@12
+	glMaterialfv = glMaterialfv@12
+	glMateriali = glMateriali@12
+	glMaterialiv = glMaterialiv@12
+	glMatrixMode = glMatrixMode@4
+	glMultMatrixd = glMultMatrixd@4
+	glMultMatrixf = glMultMatrixf@4
+	glNewList = glNewList@8
+	glNormal3b = glNormal3b@12
+	glNormal3bv = glNormal3bv@4
+	glNormal3d = glNormal3d@24
+	glNormal3dv = glNormal3dv@4
+	glNormal3f = glNormal3f@12
+	glNormal3fv = glNormal3fv@4
+	glNormal3i = glNormal3i@12
+	glNormal3iv = glNormal3iv@4
+	glNormal3s = glNormal3s@12
+	glNormal3sv = glNormal3sv@4
+	glNormalPointer = glNormalPointer@12
+	glOrtho = glOrtho@48
+	glPassThrough = glPassThrough@4
+	glPixelMapfv = glPixelMapfv@12
+	glPixelMapuiv = glPixelMapuiv@12
+	glPixelMapusv = glPixelMapusv@12
+	glPixelStoref = glPixelStoref@8
+	glPixelStorei = glPixelStorei@8
+	glPixelTransferf = glPixelTransferf@8
+	glPixelTransferi = glPixelTransferi@8
+	glPixelZoom = glPixelZoom@8
+	glPointSize = glPointSize@4
+	glPolygonMode = glPolygonMode@8
+	glPolygonOffset = glPolygonOffset@8
+	glPolygonStipple = glPolygonStipple@4
+	glPopAttrib = glPopAttrib@0
+	glPopClientAttrib = glPopClientAttrib@0
+	glPopMatrix = glPopMatrix@0
+	glPopName = glPopName@0
+	glPrioritizeTextures = glPrioritizeTextures@12
+	glPushAttrib = glPushAttrib@4
+	glPushClientAttrib = glPushClientAttrib@4
+	glPushMatrix = glPushMatrix@0
+	glPushName = glPushName@4
+	glRasterPos2d = glRasterPos2d@16
+	glRasterPos2dv = glRasterPos2dv@4
+	glRasterPos2f = glRasterPos2f@8
+	glRasterPos2fv = glRasterPos2fv@4
+	glRasterPos2i = glRasterPos2i@8
+	glRasterPos2iv = glRasterPos2iv@4
+	glRasterPos2s = glRasterPos2s@8
+	glRasterPos2sv = glRasterPos2sv@4
+	glRasterPos3d = glRasterPos3d@24
+	glRasterPos3dv = glRasterPos3dv@4
+	glRasterPos3f = glRasterPos3f@12
+	glRasterPos3fv = glRasterPos3fv@4
+	glRasterPos3i = glRasterPos3i@12
+	glRasterPos3iv = glRasterPos3iv@4
+	glRasterPos3s = glRasterPos3s@12
+	glRasterPos3sv = glRasterPos3sv@4
+	glRasterPos4d = glRasterPos4d@32
+	glRasterPos4dv = glRasterPos4dv@4
+	glRasterPos4f = glRasterPos4f@16
+	glRasterPos4fv = glRasterPos4fv@4
+	glRasterPos4i = glRasterPos4i@16
+	glRasterPos4iv = glRasterPos4iv@4
+	glRasterPos4s = glRasterPos4s@16
+	glRasterPos4sv = glRasterPos4sv@4
+	glReadBuffer = glReadBuffer@4
+	glReadPixels = glReadPixels@28
+	glRectd = glRectd@32
+	glRectdv = glRectdv@8
+	glRectf = glRectf@16
+	glRectfv = glRectfv@8
+	glRecti = glRecti@16
+	glRectiv = glRectiv@8
+	glRects = glRects@16
+	glRectsv = glRectsv@8
+	glRenderMode = glRenderMode@4
+	glRotated = glRotated@32
+	glRotatef = glRotatef@16
+	glScaled = glScaled@24
+	glScalef = glScalef@12
+	glScissor = glScissor@16
+	glSelectBuffer = glSelectBuffer@8
+	glShadeModel = glShadeModel@4
+	glStencilFunc = glStencilFunc@12
+	glStencilMask = glStencilMask@4
+	glStencilOp = glStencilOp@12
+	glTexCoord1d = glTexCoord1d@8
+	glTexCoord1dv = glTexCoord1dv@4
+	glTexCoord1f = glTexCoord1f@4
+	glTexCoord1fv = glTexCoord1fv@4
+	glTexCoord1i = glTexCoord1i@4
+	glTexCoord1iv = glTexCoord1iv@4
+	glTexCoord1s = glTexCoord1s@4
+	glTexCoord1sv = glTexCoord1sv@4
+	glTexCoord2d = glTexCoord2d@16
+	glTexCoord2dv = glTexCoord2dv@4
+	glTexCoord2f = glTexCoord2f@8
+	glTexCoord2fv = glTexCoord2fv@4
+	glTexCoord2i = glTexCoord2i@8
+	glTexCoord2iv = glTexCoord2iv@4
+	glTexCoord2s = glTexCoord2s@8
+	glTexCoord2sv = glTexCoord2sv@4
+	glTexCoord3d = glTexCoord3d@24
+	glTexCoord3dv = glTexCoord3dv@4
+	glTexCoord3f = glTexCoord3f@12
+	glTexCoord3fv = glTexCoord3fv@4
+	glTexCoord3i = glTexCoord3i@12
+	glTexCoord3iv = glTexCoord3iv@4
+	glTexCoord3s = glTexCoord3s@12
+	glTexCoord3sv = glTexCoord3sv@4
+	glTexCoord4d = glTexCoord4d@32
+	glTexCoord4dv = glTexCoord4dv@4
+	glTexCoord4f = glTexCoord4f@16
+	glTexCoord4fv = glTexCoord4fv@4
+	glTexCoord4i = glTexCoord4i@16
+	glTexCoord4iv = glTexCoord4iv@4
+	glTexCoord4s = glTexCoord4s@16
+	glTexCoord4sv = glTexCoord4sv@4
+	glTexCoordPointer = glTexCoordPointer@16
+	glTexEnvf = glTexEnvf@12
+	glTexEnvfv = glTexEnvfv@12
+	glTexEnvi = glTexEnvi@12
+	glTexEnviv = glTexEnviv@12
+	glTexGend = glTexGend@16
+	glTexGendv = glTexGendv@12
+	glTexGenf = glTexGenf@12
+	glTexGenfv = glTexGenfv@12
+	glTexGeni = glTexGeni@12
+	glTexGeniv = glTexGeniv@12
+	glTexImage1D = glTexImage1D@32
+	glTexImage2D = glTexImage2D@36
+	glTexParameterf = glTexParameterf@12
+	glTexParameterfv = glTexParameterfv@12
+	glTexParameteri = glTexParameteri@12
+	glTexParameteriv = glTexParameteriv@12
+	glTexSubImage1D = glTexSubImage1D@28
+	glTexSubImage2D = glTexSubImage2D@36
+	glTranslated = glTranslated@24
+	glTranslatef = glTranslatef@12
+	glVertex2d = glVertex2d@16
+	glVertex2dv = glVertex2dv@4
+	glVertex2f = glVertex2f@8
+	glVertex2fv = glVertex2fv@4
+	glVertex2i = glVertex2i@8
+	glVertex2iv = glVertex2iv@4
+	glVertex2s = glVertex2s@8
+	glVertex2sv = glVertex2sv@4
+	glVertex3d = glVertex3d@24
+	glVertex3dv = glVertex3dv@4
+	glVertex3f = glVertex3f@12
+	glVertex3fv = glVertex3fv@4
+	glVertex3i = glVertex3i@12
+	glVertex3iv = glVertex3iv@4
+	glVertex3s = glVertex3s@12
+	glVertex3sv = glVertex3sv@4
+	glVertex4d = glVertex4d@32
+	glVertex4dv = glVertex4dv@4
+	glVertex4f = glVertex4f@16
+	glVertex4fv = glVertex4fv@4
+	glVertex4i = glVertex4i@16
+	glVertex4iv = glVertex4iv@4
+	glVertex4s = glVertex4s@16
+	glVertex4sv = glVertex4sv@4
+	glVertexPointer = glVertexPointer@16
+	glViewport = glViewport@16

From 4a132349c333aba9f4dc264d35d5b366ed5e3759 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 15 Oct 2015 15:13:12 -0700
Subject: [PATCH 57/85] i965/vec4: Don't emit MOVs for unused URB slots.

Otherwise we'd emit a MOV from the null register (which isn't allowed).

Helps 24 programs in shader-db (the geometry shaders in GSCloth):

instructions in affected programs:     302 -> 262 (-13.25%)

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 18 +++++++++++++-----
 .../drivers/dri/i965/brw_vec4_vs_visitor.cpp   |  2 +-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index f891910ae60..c39f97e3962 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1221,6 +1221,9 @@ vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
 void
 vec4_visitor::emit_ndc_computation()
 {
+   if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
+      return;
+
    /* Get the position */
    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
 
@@ -1286,7 +1289,8 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (devinfo->has_negative_rhw_bug) {
+      if (devinfo->has_negative_rhw_bug &&
+          output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
@@ -1334,8 +1338,10 @@ vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
    assert(varying < VARYING_SLOT_MAX);
    assert(output_reg[varying].type == reg.type);
    current_annotation = output_reg_annotation[varying];
-   /* Copy the register, saturating if necessary */
-   return emit(MOV(reg, src_reg(output_reg[varying])));
+   if (output_reg[varying].file != BAD_FILE)
+      return emit(MOV(reg, src_reg(output_reg[varying])));
+   else
+      return NULL;
 }
 
 void
@@ -1354,11 +1360,13 @@ vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
    }
    case BRW_VARYING_SLOT_NDC:
       current_annotation = "NDC";
-      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
+      if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
       break;
    case VARYING_SLOT_POS:
       current_annotation = "gl_Position";
-      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
+      if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
       break;
    case VARYING_SLOT_EDGE:
       /* This is present when doing unfilled polygons.  We're supposed to copy
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 485a80ee2fc..5dd4f98cecc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -217,7 +217,7 @@ vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying)
        * shader.
        */
       vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
-      if (key->clamp_vertex_color)
+      if (inst && key->clamp_vertex_color)
          inst->saturate = true;
       break;
    }

From ee868c46e80d5a394e14846552bcb2fd77df419c Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 19 Oct 2015 14:41:36 -0700
Subject: [PATCH 58/85] i965: Add devinfo parameter to brw_compact_inst_*
 funcs.

The next commit will add assertions dependent on devinfo->gen.

Use compact()/uncompact() macros where possible, like the 3-src code
does.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_eu_compact.c | 118 +++++++++++++--------
 src/mesa/drivers/dri/i965/brw_inst.h       |  30 ++++--
 2 files changed, 91 insertions(+), 57 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index b798931140f..facf3cd1b88 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -690,7 +690,7 @@ set_control_index(const struct brw_device_info *devinfo,
 
    for (int i = 0; i < 32; i++) {
       if (control_index_table[i] == uncompacted) {
-         brw_compact_inst_set_control_index(dst, i);
+         brw_compact_inst_set_control_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -711,7 +711,7 @@ set_datatype_index(const struct brw_device_info *devinfo, brw_compact_inst *dst,
 
    for (int i = 0; i < 32; i++) {
       if (datatype_table[i] == uncompacted) {
-         brw_compact_inst_set_datatype_index(dst, i);
+         brw_compact_inst_set_datatype_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -732,7 +732,7 @@ set_subreg_index(const struct brw_device_info *devinfo, brw_compact_inst *dst,
 
    for (int i = 0; i < 32; i++) {
       if (subreg_table[i] == uncompacted) {
-         brw_compact_inst_set_subreg_index(dst, i);
+         brw_compact_inst_set_subreg_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -764,7 +764,7 @@ set_src0_index(const struct brw_device_info *devinfo,
    if (!get_src_index(uncompacted, &compacted))
       return false;
 
-   brw_compact_inst_set_src0_index(dst, compacted);
+   brw_compact_inst_set_src0_index(devinfo, dst, compacted);
 
    return true;
 }
@@ -784,7 +784,7 @@ set_src1_index(const struct brw_device_info *devinfo, brw_compact_inst *dst,
          return false;
    }
 
-   brw_compact_inst_set_src1_index(dst, compacted);
+   brw_compact_inst_set_src1_index(devinfo, dst, compacted);
 
    return true;
 }
@@ -804,7 +804,7 @@ set_3src_control_index(const struct brw_device_info *devinfo,
 
    for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_control_index_table); i++) {
       if (gen8_3src_control_index_table[i] == uncompacted) {
-         brw_compact_inst_set_3src_control_index(dst, i);
+         brw_compact_inst_set_3src_control_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -838,7 +838,7 @@ set_3src_source_index(const struct brw_device_info *devinfo,
 
    for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_source_index_table); i++) {
       if (gen8_3src_source_index_table[i] == uncompacted) {
-         brw_compact_inst_set_3src_source_index(dst, i);
+         brw_compact_inst_set_3src_source_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -909,7 +909,7 @@ brw_try_compact_3src_instruction(const struct brw_device_info *devinfo,
       return false;
 
 #define compact(field) \
-   brw_compact_inst_set_3src_##field(dst, brw_inst_3src_##field(devinfo, src))
+   brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_##field(devinfo, src))
 
    compact(opcode);
 
@@ -921,7 +921,7 @@ brw_try_compact_3src_instruction(const struct brw_device_info *devinfo,
 
    compact(dst_reg_nr);
    compact(src0_rep_ctrl);
-   brw_compact_inst_set_3src_cmpt_control(dst, true);
+   brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true);
    compact(debug_control);
    compact(saturate);
    compact(src1_rep_ctrl);
@@ -1003,36 +1003,47 @@ brw_try_compact_instruction(const struct brw_device_info *devinfo,
 
    memset(&temp, 0, sizeof(temp));
 
-   brw_compact_inst_set_opcode(&temp, brw_inst_opcode(devinfo, src));
-   brw_compact_inst_set_debug_control(&temp, brw_inst_debug_control(devinfo, src));
+#define compact(field) \
+   brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src))
+
+   compact(opcode);
+   compact(debug_control);
+
    if (!set_control_index(devinfo, &temp, src))
       return false;
    if (!set_datatype_index(devinfo, &temp, src))
       return false;
    if (!set_subreg_index(devinfo, &temp, src, is_immediate))
       return false;
-   brw_compact_inst_set_acc_wr_control(&temp,
-                                       brw_inst_acc_wr_control(devinfo, src));
-   brw_compact_inst_set_cond_modifier(&temp,
-                                      brw_inst_cond_modifier(devinfo, src));
+
+   compact(acc_wr_control);
+   compact(cond_modifier);
+
    if (devinfo->gen <= 6)
-      brw_compact_inst_set_flag_subreg_nr(&temp,
-                                          brw_inst_flag_subreg_nr(devinfo, src));
-   brw_compact_inst_set_cmpt_control(&temp, true);
+      compact(flag_subreg_nr);
+
+   brw_compact_inst_set_cmpt_control(devinfo, &temp, true);
+
    if (!set_src0_index(devinfo, &temp, src))
       return false;
    if (!set_src1_index(devinfo, &temp, src, is_immediate))
       return false;
-   brw_compact_inst_set_dst_reg_nr(&temp, brw_inst_dst_da_reg_nr(devinfo, src));
-   brw_compact_inst_set_src0_reg_nr(&temp, brw_inst_src0_da_reg_nr(devinfo, src));
+
+   brw_compact_inst_set_dst_reg_nr(devinfo, &temp,
+                                   brw_inst_dst_da_reg_nr(devinfo, src));
+   brw_compact_inst_set_src0_reg_nr(devinfo, &temp,
+                                    brw_inst_src0_da_reg_nr(devinfo, src));
+
    if (is_immediate) {
-      brw_compact_inst_set_src1_reg_nr(&temp,
+      brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
                                        brw_inst_imm_ud(devinfo, src) & 0xff);
    } else {
-      brw_compact_inst_set_src1_reg_nr(&temp,
+      brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
                                        brw_inst_src1_da_reg_nr(devinfo, src));
    }
 
+#undef compact
+
    *dst = temp;
 
    return true;
@@ -1043,7 +1054,7 @@ set_uncompacted_control(const struct brw_device_info *devinfo, brw_inst *dst,
                         brw_compact_inst *src)
 {
    uint32_t uncompacted =
-      control_index_table[brw_compact_inst_control_index(src)];
+      control_index_table[brw_compact_inst_control_index(devinfo, src)];
 
    if (devinfo->gen >= 8) {
       brw_inst_set_bits(dst, 33, 31, (uncompacted >> 16));
@@ -1064,7 +1075,8 @@ static void
 set_uncompacted_datatype(const struct brw_device_info *devinfo, brw_inst *dst,
                          brw_compact_inst *src)
 {
-   uint32_t uncompacted = datatype_table[brw_compact_inst_datatype_index(src)];
+   uint32_t uncompacted =
+      datatype_table[brw_compact_inst_datatype_index(devinfo, src)];
 
    if (devinfo->gen >= 8) {
       brw_inst_set_bits(dst, 63, 61, (uncompacted >> 18));
@@ -1080,7 +1092,8 @@ static void
 set_uncompacted_subreg(const struct brw_device_info *devinfo, brw_inst *dst,
                        brw_compact_inst *src)
 {
-   uint16_t uncompacted = subreg_table[brw_compact_inst_subreg_index(src)];
+   uint16_t uncompacted =
+      subreg_table[brw_compact_inst_subreg_index(devinfo, src)];
 
    brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10));
    brw_inst_set_bits(dst,  68, 64, (uncompacted >>  5) & 0x1f);
@@ -1091,7 +1104,7 @@ static void
 set_uncompacted_src0(const struct brw_device_info *devinfo, brw_inst *dst,
                      brw_compact_inst *src)
 {
-   uint32_t compacted = brw_compact_inst_src0_index(src);
+   uint32_t compacted = brw_compact_inst_src0_index(devinfo, src);
    uint16_t uncompacted = src_index_table[compacted];
 
    brw_inst_set_bits(dst, 88, 77, uncompacted);
@@ -1102,11 +1115,12 @@ set_uncompacted_src1(const struct brw_device_info *devinfo, brw_inst *dst,
                      brw_compact_inst *src, bool is_immediate)
 {
    if (is_immediate) {
-      signed high5 = brw_compact_inst_src1_index(src);
+      signed high5 = brw_compact_inst_src1_index(devinfo, src);
       /* Replicate top bit of src1_index into high 20 bits of the immediate. */
       brw_inst_set_imm_ud(devinfo, dst, (high5 << 27) >> 19);
    } else {
-      uint16_t uncompacted = src_index_table[brw_compact_inst_src1_index(src)];
+      uint16_t uncompacted =
+         src_index_table[brw_compact_inst_src1_index(devinfo, src)];
 
       brw_inst_set_bits(dst, 120, 109, uncompacted);
    }
@@ -1118,7 +1132,7 @@ set_uncompacted_3src_control_index(const struct brw_device_info *devinfo,
 {
    assert(devinfo->gen >= 8);
 
-   uint32_t compacted = brw_compact_inst_3src_control_index(src);
+   uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src);
    uint32_t uncompacted = gen8_3src_control_index_table[compacted];
 
    brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7);
@@ -1134,7 +1148,7 @@ set_uncompacted_3src_source_index(const struct brw_device_info *devinfo,
 {
    assert(devinfo->gen >= 8);
 
-   uint32_t compacted = brw_compact_inst_3src_source_index(src);
+   uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src);
    uint64_t uncompacted = gen8_3src_source_index_table[compacted];
 
    brw_inst_set_bits(dst,  83,  83, (uncompacted >> 43) & 0x1);
@@ -1160,7 +1174,7 @@ brw_uncompact_3src_instruction(const struct brw_device_info *devinfo,
    assert(devinfo->gen >= 8);
 
 #define uncompact(field) \
-   brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(src))
+   brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src))
 
    uncompact(opcode);
 
@@ -1190,13 +1204,16 @@ brw_uncompact_instruction(const struct brw_device_info *devinfo, brw_inst *dst,
 {
    memset(dst, 0, sizeof(*dst));
 
-   if (devinfo->gen >= 8 && is_3src(brw_compact_inst_3src_opcode(src))) {
+   if (devinfo->gen >= 8 && is_3src(brw_compact_inst_3src_opcode(devinfo, src))) {
       brw_uncompact_3src_instruction(devinfo, dst, src);
       return;
    }
 
-   brw_inst_set_opcode(devinfo, dst, brw_compact_inst_opcode(src));
-   brw_inst_set_debug_control(devinfo, dst, brw_compact_inst_debug_control(src));
+#define uncompact(field) \
+   brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src))
+
+   uncompact(opcode);
+   uncompact(debug_control);
 
    set_uncompacted_control(devinfo, dst, src);
    set_uncompacted_datatype(devinfo, dst, src);
@@ -1206,22 +1223,31 @@ brw_uncompact_instruction(const struct brw_device_info *devinfo, brw_inst *dst,
                        brw_inst_src1_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE;
 
    set_uncompacted_subreg(devinfo, dst, src);
-   brw_inst_set_acc_wr_control(devinfo, dst, brw_compact_inst_acc_wr_control(src));
-   brw_inst_set_cond_modifier(devinfo, dst, brw_compact_inst_cond_modifier(src));
+
+   uncompact(acc_wr_control);
+   uncompact(cond_modifier);
+
    if (devinfo->gen <= 6)
-      brw_inst_set_flag_subreg_nr(devinfo, dst,
-                                  brw_compact_inst_flag_subreg_nr(src));
+      uncompact(flag_subreg_nr);
+
    set_uncompacted_src0(devinfo, dst, src);
    set_uncompacted_src1(devinfo, dst, src, is_immediate);
-   brw_inst_set_dst_da_reg_nr(devinfo, dst, brw_compact_inst_dst_reg_nr(src));
-   brw_inst_set_src0_da_reg_nr(devinfo, dst, brw_compact_inst_src0_reg_nr(src));
+
+   brw_inst_set_dst_da_reg_nr(devinfo, dst,
+                              brw_compact_inst_dst_reg_nr(devinfo, src));
+   brw_inst_set_src0_da_reg_nr(devinfo, dst,
+                               brw_compact_inst_src0_reg_nr(devinfo, src));
+
    if (is_immediate) {
       brw_inst_set_imm_ud(devinfo, dst,
                           brw_inst_imm_ud(devinfo, dst) |
-                          brw_compact_inst_src1_reg_nr(src));
+                          brw_compact_inst_src1_reg_nr(devinfo, src));
    } else {
-      brw_inst_set_src1_da_reg_nr(devinfo, dst, brw_compact_inst_src1_reg_nr(src));
+      brw_inst_set_src1_da_reg_nr(devinfo, dst,
+                                  brw_compact_inst_src1_reg_nr(devinfo, src));
    }
+
+#undef uncompact
 }
 
 void brw_debug_compact_uncompact(const struct brw_device_info *devinfo,
@@ -1415,8 +1441,8 @@ brw_compact_instructions(struct brw_codegen *p, int start_offset,
          if ((offset & sizeof(brw_compact_inst)) != 0 && devinfo->is_g4x){
             brw_compact_inst *align = store + offset;
             memset(align, 0, sizeof(*align));
-            brw_compact_inst_set_opcode(align, BRW_OPCODE_NENOP);
-            brw_compact_inst_set_cmpt_control(align, true);
+            brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NENOP);
+            brw_compact_inst_set_cmpt_control(devinfo, align, true);
             offset += sizeof(brw_compact_inst);
             compacted_count--;
             compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count;
@@ -1524,8 +1550,8 @@ brw_compact_instructions(struct brw_codegen *p, int start_offset,
    if (p->next_insn_offset & sizeof(brw_compact_inst)) {
       brw_compact_inst *align = store + offset;
       memset(align, 0, sizeof(*align));
-      brw_compact_inst_set_opcode(align, BRW_OPCODE_NOP);
-      brw_compact_inst_set_cmpt_control(align, true);
+      brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NOP);
+      brw_compact_inst_set_cmpt_control(devinfo, align, true);
       p->next_insn_offset += sizeof(brw_compact_inst);
    }
    p->nr_insn = p->next_insn_offset / sizeof(brw_inst);
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index ab37b709d65..0d88ae29ed6 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -762,19 +762,27 @@ brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low,
    inst->data = (inst->data & ~mask) | (value << low);
 }
 
-#define F(name, high, low)                                      \
-static inline void                                              \
-brw_compact_inst_set_##name(brw_compact_inst *inst, unsigned v) \
-{                                                               \
-   brw_compact_inst_set_bits(inst, high, low, v);               \
-}                                                               \
-                                                                \
-static inline unsigned                                          \
-brw_compact_inst_##name(brw_compact_inst *inst)                 \
-{                                                               \
-   return brw_compact_inst_bits(inst, high, low);               \
+#define FC(name, high, low, assertions)                            \
+static inline void                                                 \
+brw_compact_inst_set_##name(const struct brw_device_info *devinfo, \
+                            brw_compact_inst *inst, unsigned v)    \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   brw_compact_inst_set_bits(inst, high, low, v);                  \
+}                                                                  \
+static inline unsigned                                             \
+brw_compact_inst_##name(const struct brw_device_info *devinfo,     \
+                        brw_compact_inst *inst)                    \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   return brw_compact_inst_bits(inst, high, low);                  \
 }
 
+/* A simple macro for fields which stay in the same place on all generations. */
+#define F(name, high, low) FC(name, high, low, true)
+
 F(src1_reg_nr,    63, 56)
 F(src0_reg_nr,    55, 48)
 F(dst_reg_nr,     47, 40)

From 35f3f06c8a2e614beac90cf25b9dfff98bb420dc Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 19 Oct 2015 15:01:20 -0700
Subject: [PATCH 59/85] i965: Compact acc_wr_control only on Gen6+.

It only exists on Gen6+, and the next patches will add compaction
support for the (unused) field in the same location on earlier
platforms.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_eu_compact.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index facf3cd1b88..b122deca727 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -1016,7 +1016,10 @@ brw_try_compact_instruction(const struct brw_device_info *devinfo,
    if (!set_subreg_index(devinfo, &temp, src, is_immediate))
       return false;
 
-   compact(acc_wr_control);
+   if (devinfo->gen >= 6) {
+      compact(acc_wr_control);
+   }
+
    compact(cond_modifier);
 
    if (devinfo->gen <= 6)
@@ -1224,7 +1227,10 @@ brw_uncompact_instruction(const struct brw_device_info *devinfo, brw_inst *dst,
 
    set_uncompacted_subreg(devinfo, dst, src);
 
-   uncompact(acc_wr_control);
+   if (devinfo->gen >= 6) {
+      uncompact(acc_wr_control);
+   }
+
    uncompact(cond_modifier);
 
    if (devinfo->gen <= 6)

From d14907b94676fb563761866fca5bdf0492cdb8df Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 19 Oct 2015 14:46:18 -0700
Subject: [PATCH 60/85] i965: Prepare for next commit by adding more
 whitespace.

We're going to add a field with a longer name that wouldn't align with
the rest.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_inst.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 0d88ae29ed6..088143cbace 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -783,20 +783,20 @@ brw_compact_inst_##name(const struct brw_device_info *devinfo,     \
 /* A simple macro for fields which stay in the same place on all generations. */
 #define F(name, high, low) FC(name, high, low, true)
 
-F(src1_reg_nr,    63, 56)
-F(src0_reg_nr,    55, 48)
-F(dst_reg_nr,     47, 40)
-F(src1_index,     39, 35)
-F(src0_index,     34, 30)
-F(cmpt_control,   29, 29) /* Same location as brw_inst */
-F(flag_subreg_nr, 28, 28) /* <= Gen6 only */
-F(cond_modifier,  27, 24) /* Same location as brw_inst */
-F(acc_wr_control, 23, 23)
-F(subreg_index,   22, 18)
-F(datatype_index, 17, 13)
-F(control_index,  12,  8)
-F(debug_control,   7,  7)
-F(opcode,          6,  0) /* Same location as brw_inst */
+F(src1_reg_nr,      63, 56)
+F(src0_reg_nr,      55, 48)
+F(dst_reg_nr,       47, 40)
+F(src1_index,       39, 35)
+F(src0_index,       34, 30)
+F(cmpt_control,     29, 29) /* Same location as brw_inst */
+F(flag_subreg_nr,   28, 28) /* <= Gen6 only */
+F(cond_modifier,    27, 24) /* Same location as brw_inst */
+F(acc_wr_control,   23, 23)
+F(subreg_index,     22, 18)
+F(datatype_index,   17, 13)
+F(control_index,    12,  8)
+F(debug_control,     7,  7)
+F(opcode,            6,  0) /* Same location as brw_inst */
 
 /**
  * (Gen8+) Compacted three-source instructions:

From 3ec9d96d435d5f57b35d4a8c75149fd75eaba187 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 19 Oct 2015 14:47:17 -0700
Subject: [PATCH 61/85] i965: Add devinfo->gen assertions for acc_wr_control.

... and for flag_subreg_nr since it's right nearby.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_inst.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 088143cbace..cb3d7e69a9a 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -181,7 +181,7 @@ F(saturate,             31,  31)
 F(debug_control,        30,  30)
 F(cmpt_control,         29,  29)
 FC(branch_control,      28,  28, devinfo->gen >= 8)
-F(acc_wr_control,       28,  28)
+FC(acc_wr_control,      28,  28, devinfo->gen >= 6)
 F(cond_modifier,        27,  24)
 FC(math_function,       27,  24, devinfo->gen >= 6)
 F(exec_size,            23,  21)
@@ -789,9 +789,9 @@ F(dst_reg_nr,       47, 40)
 F(src1_index,       39, 35)
 F(src0_index,       34, 30)
 F(cmpt_control,     29, 29) /* Same location as brw_inst */
-F(flag_subreg_nr,   28, 28) /* <= Gen6 only */
+FC(flag_subreg_nr,  28, 28, devinfo->gen <= 6)
 F(cond_modifier,    27, 24) /* Same location as brw_inst */
-F(acc_wr_control,   23, 23)
+FC(acc_wr_control,  23, 23, devinfo->gen >= 6)
 F(subreg_index,     22, 18)
 F(datatype_index,   17, 13)
 F(control_index,    12,  8)

From b29f92daec59a4181a45175b6bfc6e636c57fb33 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 19 Oct 2015 15:08:28 -0700
Subject: [PATCH 62/85] i965: Add mask_control_ex field and handle it in
 compaction.

Documentation is sparse, but it appears to have existed on G45 and ILK
as a second bit extension of the mask_control field. Setting the pair of
bits to 0b11 enables "NoCMask".

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_eu_compact.c | 4 ++++
 src/mesa/drivers/dri/i965/brw_inst.h       | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index b122deca727..f787ea3d4f8 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -1018,6 +1018,8 @@ brw_try_compact_instruction(const struct brw_device_info *devinfo,
 
    if (devinfo->gen >= 6) {
       compact(acc_wr_control);
+   } else {
+      compact(mask_control_ex);
    }
 
    compact(cond_modifier);
@@ -1229,6 +1231,8 @@ brw_uncompact_instruction(const struct brw_device_info *devinfo, brw_inst *dst,
 
    if (devinfo->gen >= 6) {
       uncompact(acc_wr_control);
+   } else {
+      uncompact(mask_control_ex);
    }
 
    uncompact(cond_modifier);
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index cb3d7e69a9a..819ce596547 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -182,6 +182,7 @@ F(debug_control,        30,  30)
 F(cmpt_control,         29,  29)
 FC(branch_control,      28,  28, devinfo->gen >= 8)
 FC(acc_wr_control,      28,  28, devinfo->gen >= 6)
+FC(mask_control_ex,     28,  28, devinfo->is_g4x || devinfo->gen == 5)
 F(cond_modifier,        27,  24)
 FC(math_function,       27,  24, devinfo->gen >= 6)
 F(exec_size,            23,  21)
@@ -792,6 +793,7 @@ F(cmpt_control,     29, 29) /* Same location as brw_inst */
 FC(flag_subreg_nr,  28, 28, devinfo->gen <= 6)
 F(cond_modifier,    27, 24) /* Same location as brw_inst */
 FC(acc_wr_control,  23, 23, devinfo->gen >= 6)
+FC(mask_control_ex, 23, 23, devinfo->is_g4x || devinfo->gen == 5)
 F(subreg_index,     22, 18)
 F(datatype_index,   17, 13)
 F(control_index,    12,  8)

From 05cc56cca3abac0dc8e34469a260fe3c635a12d8 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 19 Oct 2015 18:59:53 -0700
Subject: [PATCH 63/85] i965: Add const to brw_compact_inst_bits.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_inst.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 819ce596547..524a4fb1d37 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -739,7 +739,7 @@ typedef struct {
  * Bits indices range from 0..63.
  */
 static inline unsigned
-brw_compact_inst_bits(brw_compact_inst *inst, unsigned high, unsigned low)
+brw_compact_inst_bits(const brw_compact_inst *inst, unsigned high, unsigned low)
 {
    const uint64_t mask = (1ull << (high - low + 1)) - 1;
 
@@ -774,7 +774,7 @@ brw_compact_inst_set_##name(const struct brw_device_info *devinfo, \
 }                                                                  \
 static inline unsigned                                             \
 brw_compact_inst_##name(const struct brw_device_info *devinfo,     \
-                        brw_compact_inst *inst)                    \
+                        const brw_compact_inst *inst)              \
 {                                                                  \
    assert(assertions);                                             \
    (void) devinfo;                                                 \

From 2ce659b5e422f7e7639d6e451160c197717df823 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 19 Oct 2015 19:02:16 -0700
Subject: [PATCH 64/85] i965: Mark compacted 3-src instructions as Gen8+.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_inst.h | 32 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 524a4fb1d37..db05a8a5f30 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -804,24 +804,24 @@ F(opcode,            6,  0) /* Same location as brw_inst */
  * (Gen8+) Compacted three-source instructions:
  *  @{
  */
-F(3src_src2_reg_nr,    63, 57)
-F(3src_src1_reg_nr,    56, 50)
-F(3src_src0_reg_nr,    49, 43)
-F(3src_src2_subreg_nr, 42, 40)
-F(3src_src1_subreg_nr, 39, 37)
-F(3src_src0_subreg_nr, 36, 34)
-F(3src_src2_rep_ctrl,  33, 33)
-F(3src_src1_rep_ctrl,  32, 32)
-F(3src_saturate,       31, 31)
-F(3src_debug_control,  30, 30)
-F(3src_cmpt_control,   29, 29)
-F(3src_src0_rep_ctrl,  28, 28)
+FC(3src_src2_reg_nr,    63, 57, devinfo->gen >= 8)
+FC(3src_src1_reg_nr,    56, 50, devinfo->gen >= 8)
+FC(3src_src0_reg_nr,    49, 43, devinfo->gen >= 8)
+FC(3src_src2_subreg_nr, 42, 40, devinfo->gen >= 8)
+FC(3src_src1_subreg_nr, 39, 37, devinfo->gen >= 8)
+FC(3src_src0_subreg_nr, 36, 34, devinfo->gen >= 8)
+FC(3src_src2_rep_ctrl,  33, 33, devinfo->gen >= 8)
+FC(3src_src1_rep_ctrl,  32, 32, devinfo->gen >= 8)
+FC(3src_saturate,       31, 31, devinfo->gen >= 8)
+FC(3src_debug_control,  30, 30, devinfo->gen >= 8)
+FC(3src_cmpt_control,   29, 29, devinfo->gen >= 8)
+FC(3src_src0_rep_ctrl,  28, 28, devinfo->gen >= 8)
 /* Reserved */
-F(3src_dst_reg_nr,     18, 12)
-F(3src_source_index,   11, 10)
-F(3src_control_index,   9,  8)
+FC(3src_dst_reg_nr,     18, 12, devinfo->gen >= 8)
+FC(3src_source_index,   11, 10, devinfo->gen >= 8)
+FC(3src_control_index,   9,  8, devinfo->gen >= 8)
 /* Bit 7 is Reserved (for future Opcode expansion) */
-F(3src_opcode,          6,  0)
+FC(3src_opcode,          6,  0, devinfo->gen >= 8)
 /** @} */
 
 #undef F

From f1147a238ab35a56fa7d1c64f6025ff3b909dad8 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Thu, 8 Oct 2015 16:44:30 -0700
Subject: [PATCH 65/85] mesa/glformats: Undo code changes from
 _mesa_base_tex_format() move

The refactoring commit, c6bf1cd, accidentally reverted cd49b97
and 99b1f47. These changes caused more code to be added to the
function and removed the existing support for ASTC. This patch
reverts those modifications.

v2. Actually include ASTC support again.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92221
Cc: "11.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/mesa/main/glformats.c | 149 ++------------------------------------
 1 file changed, 8 insertions(+), 141 deletions(-)

diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index faa63825380..2ed42eaffdd 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -2275,45 +2275,16 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       ; /* fallthrough */
    }
 
-   if (ctx->Extensions.TDFX_texture_compression_FXT1) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGB_FXT1_3DFX:
-         return GL_RGB;
-      case GL_COMPRESSED_RGBA_FXT1_3DFX:
-         return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
+   if (_mesa_is_compressed_format(ctx, internalFormat)) {
+      GLenum base_compressed =
+         _mesa_gl_compressed_format_base_format(internalFormat);
+      if (base_compressed)
+            return base_compressed;
    }
 
-   /* Assume that the ANGLE flag will always be set if the EXT flag is set.
-    */
-   if (ctx->Extensions.ANGLE_texture_compression_dxt) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-         return GL_RGB;
-      case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-      case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-         return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (_mesa_is_desktop_gl(ctx)
-       && ctx->Extensions.ANGLE_texture_compression_dxt) {
-      switch (internalFormat) {
-      case GL_RGB_S3TC:
-      case GL_RGB4_S3TC:
-         return GL_RGB;
-      case GL_RGBA_S3TC:
-      case GL_RGBA4_S3TC:
-         return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
-   }
+   if (ctx->Extensions.KHR_texture_compression_astc_ldr &&
+      _mesa_is_astc_format(internalFormat))
+        return GL_RGBA;
 
    if (ctx->Extensions.MESA_ycbcr_texture) {
       if (internalFormat == GL_YCBCR_MESA)
@@ -2390,16 +2361,10 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       case GL_SRGB8_EXT:
       case GL_COMPRESSED_SRGB_EXT:
          return GL_RGB;
-      case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
-         return ctx->Extensions.EXT_texture_compression_s3tc ? GL_RGB : -1;
       case GL_SRGB_ALPHA_EXT:
       case GL_SRGB8_ALPHA8_EXT:
       case GL_COMPRESSED_SRGB_ALPHA_EXT:
          return GL_RGBA;
-      case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
-      case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
-      case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
-         return ctx->Extensions.EXT_texture_compression_s3tc ? GL_RGBA : -1;
       case GL_SLUMINANCE_ALPHA_EXT:
       case GL_SLUMINANCE8_ALPHA8_EXT:
       case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT:
@@ -2544,104 +2509,6 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       }
    }
 
-   if (ctx->Extensions.ARB_texture_compression_rgtc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RED_RGTC1:
-      case GL_COMPRESSED_SIGNED_RED_RGTC1:
-         return GL_RED;
-      case GL_COMPRESSED_RG_RGTC2:
-      case GL_COMPRESSED_SIGNED_RG_RGTC2:
-         return GL_RG;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->Extensions.EXT_texture_compression_latc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_LUMINANCE_LATC1_EXT:
-      case GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT:
-         return GL_LUMINANCE;
-      case GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT:
-      case GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT:
-         return GL_LUMINANCE_ALPHA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->Extensions.ATI_texture_compression_3dc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI:
-         return GL_LUMINANCE_ALPHA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->Extensions.OES_compressed_ETC1_RGB8_texture) {
-      switch (internalFormat) {
-      case GL_ETC1_RGB8_OES:
-         return GL_RGB;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (_mesa_is_gles3(ctx) || ctx->Extensions.ARB_ES3_compatibility) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGB8_ETC2:
-      case GL_COMPRESSED_SRGB8_ETC2:
-         return GL_RGB;
-      case GL_COMPRESSED_RGBA8_ETC2_EAC:
-      case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
-      case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-      case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-         return GL_RGBA;
-      case GL_COMPRESSED_R11_EAC:
-      case GL_COMPRESSED_SIGNED_R11_EAC:
-         return GL_RED;
-      case GL_COMPRESSED_RG11_EAC:
-      case GL_COMPRESSED_SIGNED_RG11_EAC:
-         return GL_RG;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (_mesa_is_desktop_gl(ctx) &&
-       ctx->Extensions.ARB_texture_compression_bptc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGBA_BPTC_UNORM:
-      case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
-         return GL_RGBA;
-      case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
-      case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
-         return GL_RGB;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->API == API_OPENGLES) {
-      switch (internalFormat) {
-      case GL_PALETTE4_RGB8_OES:
-      case GL_PALETTE4_R5_G6_B5_OES:
-      case GL_PALETTE8_RGB8_OES:
-      case GL_PALETTE8_R5_G6_B5_OES:
-	 return GL_RGB;
-      case GL_PALETTE4_RGBA8_OES:
-      case GL_PALETTE8_RGB5_A1_OES:
-      case GL_PALETTE4_RGBA4_OES:
-      case GL_PALETTE4_RGB5_A1_OES:
-      case GL_PALETTE8_RGBA8_OES:
-      case GL_PALETTE8_RGBA4_OES:
-	 return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
    return -1; /* error */
 }
 

From c643518452ce9a0c107c5666e2caa7806a04e7d4 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Tue, 20 Oct 2015 14:29:36 -0700
Subject: [PATCH 66/85] i965: Correct the comment about fb write payload

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_defines.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 393f17ac98c..7a5ee1b1756 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -916,8 +916,8 @@ enum opcode {
     * Source 0: [required] Color 0.
     * Source 1: [optional] Color 1 (for dual source blend messages).
     * Source 2: [optional] Src0 Alpha.
-    * Source 3: [optional] Source Depth (passthrough from the thread payload).
-    * Source 4: [optional] Destination Depth (gl_FragDepth).
+    * Source 3: [optional] Source Depth (gl_FragDepth)
+    * Source 4: [optional (gen4-5)] Destination Depth passthrough from thread
     * Source 5: [optional] Sample Mask (gl_SampleMask).
     * Source 6: [required] Number of color components (as a UD immediate).
     */

From 3c5d24363a4c3e332d9b0820b4682d3a336d00f7 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Tue, 20 Oct 2015 14:29:38 -0700
Subject: [PATCH 67/85] i965: (trivial) rename computes stencil to gen9

All the documentation I can find says that this bit (and functionality) only
exists on SKL+. Since the bit isn't yet used, there is no real impact here.

The original code was added by Ken here (a surprisingly long time ago):
commit f3c6d6f1e151f6a44a76038dccebe4434038dcb1
Author: Kenneth Graunke <kenneth@whitecape.org>
Date:   Thu Nov 29 21:00:27 2012 -0800

    i965: Update 3DSTATE_PS, 3DSTATE_WM, and add 3DSTATE_PS_EXTRA.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_defines.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 7a5ee1b1756..e61ad545744 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2383,7 +2383,7 @@ enum brw_pixel_shader_coverage_mask_mode {
 # define GEN8_PSX_ATTRIBUTE_ENABLE                      (1 << 8)
 # define GEN8_PSX_SHADER_DISABLES_ALPHA_TO_COVERAGE     (1 << 7)
 # define GEN8_PSX_SHADER_IS_PER_SAMPLE                  (1 << 6)
-# define GEN8_PSX_SHADER_COMPUTES_STENCIL               (1 << 5)
+# define GEN9_PSX_SHADER_COMPUTES_STENCIL               (1 << 5)
 # define GEN9_PSX_SHADER_PULLS_BARY                     (1 << 3)
 # define GEN8_PSX_SHADER_HAS_UAV                        (1 << 2)
 # define GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK       (1 << 1)

From 4eb84a03becec582a3a26ceee470334f0fba4721 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 20 Oct 2015 16:35:44 -0700
Subject: [PATCH 68/85] nir/info: Add more information about geometry shaders

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/nir/glsl_to_nir.cpp |  4 ++++
 src/glsl/nir/nir.h           | 12 ++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index c9cdf35d6db..9b50a93e7f6 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -170,8 +170,12 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
 
    switch (stage) {
    case MESA_SHADER_GEOMETRY:
+      shader->info.gs.vertices_in = shader_prog->Geom.VerticesIn;
+      shader->info.gs.output_primitive = sh->Geom.OutputType;
       shader->info.gs.vertices_out = sh->Geom.VerticesOut;
       shader->info.gs.invocations = sh->Geom.Invocations;
+      shader->info.gs.uses_end_primitive = shader_prog->Geom.UsesEndPrimitive;
+      shader->info.gs.uses_streams = shader_prog->Geom.UsesStreams;
       break;
 
    case MESA_SHADER_FRAGMENT: {
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 2ab48fb9d9c..f65d44c4bdf 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1495,11 +1495,23 @@ typedef struct nir_shader_info {
 
    union {
       struct {
+         /** The number of vertices received per input primitive */
+         unsigned vertices_in;
+
+         /** The output primitive type (GL enum value) */
+         unsigned output_primitive;
+
          /** The maximum number of vertices the geometry shader might write. */
          unsigned vertices_out;
 
          /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
          unsigned invocations;
+
+         /** Whether or not this shader uses EndPrimitive */
+         bool uses_end_primitive;
+
+         /** Whether or not this shader uses non-zero streams */
+         bool uses_streams;
       } gs;
 
       struct {

From 2686477d3757de50c31aa193bde8ad57fe539e44 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 20 Oct 2015 17:40:19 -0700
Subject: [PATCH 69/85] nir: Constify nir_gs_count_vertices

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/glsl/nir/nir.h                   | 2 +-
 src/glsl/nir/nir_gs_count_vertices.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index f65d44c4bdf..e3777f926e2 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1899,7 +1899,7 @@ void nir_dump_dom_frontier(nir_shader *shader, FILE *fp);
 void nir_dump_cfg_impl(nir_function_impl *impl, FILE *fp);
 void nir_dump_cfg(nir_shader *shader, FILE *fp);
 
-int nir_gs_count_vertices(nir_shader *shader);
+int nir_gs_count_vertices(const nir_shader *shader);
 
 bool nir_split_var_copies(nir_shader *shader);
 
diff --git a/src/glsl/nir/nir_gs_count_vertices.c b/src/glsl/nir/nir_gs_count_vertices.c
index e0bdf170d22..1c360673ddc 100644
--- a/src/glsl/nir/nir_gs_count_vertices.c
+++ b/src/glsl/nir/nir_gs_count_vertices.c
@@ -51,7 +51,7 @@ as_set_vertex_count(nir_instr *instr)
  * counting at the NIR level.
  */
 int
-nir_gs_count_vertices(nir_shader *shader)
+nir_gs_count_vertices(const nir_shader *shader)
 {
    int count = -1;
 

From 8e8b527b27b5b31f88e85e9878652a11c7fa5ca4 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 20 Oct 2015 16:46:50 -0700
Subject: [PATCH 70/85] i965/gs: Set static_vertex_count unconditionally on
 GEN8+

We always have NIR, so there's no reason for the check.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_gs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 10a7f28fdab..02f506de3b6 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -96,7 +96,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
                                &c.prog_data.base.base, false);
 
    if (brw->gen >= 8) {
-      c.prog_data.static_vertex_count = !gp->program.Base.nir ? -1 :
+      c.prog_data.static_vertex_count =
          nir_gs_count_vertices(gp->program.Base.nir);
    }
 

From 72148de217786473bb2bb02b99f49fd28bdda0e2 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 20 Oct 2015 16:21:09 -0700
Subject: [PATCH 71/85] i965/gs: Move the mem_ctx argument to brw_compile_gs

This makes it better match the other brw_compile_* functions.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_compiler.h          | 2 +-
 src/mesa/drivers/dri/i965/brw_gs.c                | 4 ++--
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 11c485d2f08..8c10495caa6 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -615,10 +615,10 @@ struct brw_gs_compile
  */
 const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
                struct brw_gs_compile *c,
                const struct nir_shader *shader,
                struct gl_shader_program *shader_prog,
-               void *mem_ctx,
                int shader_time_index,
                unsigned *final_assembly_size,
                char **error_str);
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 02f506de3b6..ae7f2422e8c 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -303,9 +303,9 @@ brw_codegen_gs_prog(struct brw_context *brw,
    unsigned program_size;
    char *error_str;
    const unsigned *program =
-      brw_compile_gs(brw->intelScreen->compiler, brw, &c,
+      brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, &c,
                      shader->Program->nir, prog,
-                     mem_ctx, st_index, &program_size, &error_str);
+                     st_index, &program_size, &error_str);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index a715cf5a6cb..47f2dd89e4d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -598,10 +598,10 @@ vec4_gs_visitor::gs_end_primitive()
 
 extern "C" const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
                struct brw_gs_compile *c,
                const nir_shader *shader,
                struct gl_shader_program *shader_prog,
-               void *mem_ctx,
                int shader_time_index,
                unsigned *final_assembly_size,
                char **error_str)

From 6ac2bbec16d73f0cc58fc520c4165239461c59b3 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 20 Oct 2015 16:40:30 -0700
Subject: [PATCH 72/85] i965/gs: Use NIR instead of the brw_geometry_program
 for GS metadata

With this, we can remove the geometry program from brw_gs_compile.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_compiler.h          |  2 --
 src/mesa/drivers/dri/i965/brw_gs.c                |  1 -
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp |  6 +++---
 src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp     | 12 ++++++------
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 8c10495caa6..4a87b293c2b 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -602,8 +602,6 @@ struct brw_gs_compile
    struct brw_gs_prog_data prog_data;
    struct brw_vue_map input_vue_map;
 
-   struct brw_geometry_program *gp;
-
    unsigned control_data_bits_per_vertex;
    unsigned control_data_header_size_bits;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index ae7f2422e8c..effd5bfb46f 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -62,7 +62,6 @@ brw_codegen_gs_prog(struct brw_context *brw,
    struct brw_gs_compile c;
    memset(&c, 0, sizeof(c));
    c.key = *key;
-   c.gp = gp;
 
    c.prog_data.include_primitive_id =
       (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 47f2dd89e4d..0a624857b08 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -78,7 +78,7 @@ vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
     * so the total number of input slots that will be delivered to the GS (and
     * thus the stride of the input arrays) is urb_read_length * 2.
     */
-   const unsigned num_input_vertices = c->gp->program.VerticesIn;
+   const unsigned num_input_vertices = nir->info.gs.vertices_in;
    assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
    unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
 
@@ -182,9 +182,9 @@ vec4_gs_visitor::emit_prolog()
     * to account for the fact that the vertex shader stored it in the w
     * component of VARYING_SLOT_PSIZ.
     */
-   if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
+   if (nir->info.inputs_read & VARYING_BIT_PSIZ) {
       this->current_annotation = "swizzle gl_PointSize input";
-      for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
+      for (int vertex = 0; vertex < (int)nir->info.gs.vertices_in; vertex++) {
          dst_reg dst(ATTR,
                      BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
          dst.type = BRW_REGISTER_TYPE_F;
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index 671a535a5bd..31cdc96d186 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -63,7 +63,7 @@ gen6_gs_visitor::emit_prolog()
    this->vertex_output = src_reg(this,
                                  glsl_type::uint_type,
                                  (prog_data->vue_map.num_slots + 1) *
-                                 c->gp->program.VerticesOut);
+                                 nir->info.gs.vertices_out);
    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
 
@@ -177,7 +177,7 @@ gen6_gs_visitor::gs_emit_vertex(int stream_id)
    dst_reg dst(this->vertex_output);
    dst.reladdr = ralloc(mem_ctx, src_reg);
    memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
-   if (c->gp->program.OutputType == GL_POINTS) {
+   if (nir->info.gs.output_primitive == GL_POINTS) {
       /* If we are outputting points, then every vertex has PrimStart and
        * PrimEnd set.
        */
@@ -205,7 +205,7 @@ gen6_gs_visitor::gs_end_primitive()
    /* Calling EndPrimitive() is optional for point output. In this case we set
     * the PrimEnd flag when we process EmitVertex().
     */
-   if (c->gp->program.OutputType == GL_POINTS)
+   if (nir->info.gs.output_primitive == GL_POINTS)
       return;
 
    /* Otherwise we know that the last vertex we have processed was the last
@@ -217,7 +217,7 @@ gen6_gs_visitor::gs_end_primitive()
     * comparison below (hence the num_output_vertices + 1 in the comparison
     * below).
     */
-   unsigned num_output_vertices = c->gp->program.VerticesOut;
+   unsigned num_output_vertices = nir->info.gs.vertices_out;
    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
             BRW_CONDITIONAL_L));
    vec4_instruction *inst = emit(CMP(dst_null_d(),
@@ -320,7 +320,7 @@ gen6_gs_visitor::emit_thread_end()
     * first_vertex is not zero. This is only relevant for outputs other than
     * points because in the point case we set PrimEnd on all vertices.
     */
-   if (c->gp->program.OutputType != GL_POINTS) {
+   if (nir->info.gs.output_primitive != GL_POINTS) {
       emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
       emit(IF(BRW_PREDICATE_NORMAL));
       gs_end_primitive();
@@ -627,7 +627,7 @@ gen6_gs_visitor::xfb_write()
    emit(BRW_OPCODE_ENDIF);
 
    /* Write transform feedback data for all processed vertices. */
-   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
+   for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
       emit(MOV(dst_reg(sol_temp), i));
       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
                BRW_CONDITIONAL_L));

From fac9b21e037f9ce456039fbf35cd5fa573dee229 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 20 Oct 2015 17:12:03 -0700
Subject: [PATCH 73/85] i965/gs: Pull prog_data out of brw_gs_compile

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_compiler.h      |  2 +-
 src/mesa/drivers/dri/i965/brw_gs.c            | 60 ++++++++++---------
 src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp |  2 +-
 .../drivers/dri/i965/brw_vec4_gs_visitor.cpp  | 51 ++++++++--------
 .../drivers/dri/i965/brw_vec4_gs_visitor.h    |  2 +
 src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp | 39 +++++-------
 src/mesa/drivers/dri/i965/gen6_gs_visitor.h   |  3 +-
 7 files changed, 80 insertions(+), 79 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 4a87b293c2b..859cfa5c07e 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -599,7 +599,6 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
 struct brw_gs_compile
 {
    struct brw_gs_prog_key key;
-   struct brw_gs_prog_data prog_data;
    struct brw_vue_map input_vue_map;
 
    unsigned control_data_bits_per_vertex;
@@ -615,6 +614,7 @@ const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
                void *mem_ctx,
                struct brw_gs_compile *c,
+               struct brw_gs_prog_data *prog_data,
                const struct nir_shader *shader,
                struct gl_shader_program *shader_prog,
                int shader_time_index,
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index effd5bfb46f..d7ea2f043f9 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -59,17 +59,19 @@ brw_codegen_gs_prog(struct brw_context *brw,
 {
    struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    struct brw_stage_state *stage_state = &brw->gs.base;
+   struct brw_gs_prog_data prog_data;
    struct brw_gs_compile c;
+   memset(&prog_data, 0, sizeof(prog_data));
    memset(&c, 0, sizeof(c));
    c.key = *key;
 
-   c.prog_data.include_primitive_id =
+   prog_data.include_primitive_id =
       (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;
 
-   c.prog_data.invocations = gp->program.Invocations;
+   prog_data.invocations = gp->program.Invocations;
 
    assign_gs_binding_table_offsets(brw->intelScreen->devinfo, prog,
-                                   &gp->program.Base, &c.prog_data);
+                                   &gp->program.Base, &prog_data);
 
    /* Allocate the references to the uniforms that will end up in the
     * prog_data associated with the compiled program, and which will be freed
@@ -82,20 +84,20 @@ brw_codegen_gs_prog(struct brw_context *brw,
    struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    int param_count = gp->program.Base.nir->num_uniforms * 4;
 
-   c.prog_data.base.base.param =
+   prog_data.base.base.param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
-   c.prog_data.base.base.pull_param =
+   prog_data.base.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
-   c.prog_data.base.base.image_param =
+   prog_data.base.base.image_param =
       rzalloc_array(NULL, struct brw_image_param, gs->NumImages);
-   c.prog_data.base.base.nr_params = param_count;
-   c.prog_data.base.base.nr_image_params = gs->NumImages;
+   prog_data.base.base.nr_params = param_count;
+   prog_data.base.base.nr_image_params = gs->NumImages;
 
    brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base,
-                               &c.prog_data.base.base, false);
+                               &prog_data.base.base, false);
 
    if (brw->gen >= 8) {
-      c.prog_data.static_vertex_count =
+      prog_data.static_vertex_count =
          nir_gs_count_vertices(gp->program.Base.nir);
    }
 
@@ -105,7 +107,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
           * to multiple streams, and EndPrimitive() has no effect.  So we
           * configure the hardware to interpret the control data as stream ID.
           */
-         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+         prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
 
          /* We only have to emit control bits if we are using streams */
          if (prog->Geom.UsesStreams)
@@ -119,7 +121,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
           * streams is not supported.  So we configure the hardware to interpret
           * the control data as EndPrimitive information (a.k.a. "cut bits").
           */
-         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
+         prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
 
          /* We only need to output control data if the shader actually calls
           * EndPrimitive().
@@ -132,21 +134,21 @@ brw_codegen_gs_prog(struct brw_context *brw,
 
       /* If it is using transform feedback, enable it */
       if (prog->TransformFeedback.NumVarying)
-         c.prog_data.gen6_xfb_enabled = true;
+         prog_data.gen6_xfb_enabled = true;
       else
-         c.prog_data.gen6_xfb_enabled = false;
+         prog_data.gen6_xfb_enabled = false;
    }
    c.control_data_header_size_bits =
       gp->program.VerticesOut * c.control_data_bits_per_vertex;
 
    /* 1 HWORD = 32 bytes = 256 bits */
-   c.prog_data.control_data_header_size_hwords =
+   prog_data.control_data_header_size_hwords =
       ALIGN(c.control_data_header_size_bits, 256) / 256;
 
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.prog_data.base.vue_map, outputs_written,
+                       &prog_data.base.vue_map, outputs_written,
                        prog ? prog->SeparateShader : false);
 
    /* Compute the output vertex size.
@@ -197,10 +199,10 @@ brw_codegen_gs_prog(struct brw_context *brw,
     * per interpolation type, so this is plenty.
     *
     */
-   unsigned output_vertex_size_bytes = c.prog_data.base.vue_map.num_slots * 16;
+   unsigned output_vertex_size_bytes = prog_data.base.vue_map.num_slots * 16;
    assert(brw->gen == 6 ||
           output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
-   c.prog_data.output_vertex_size_hwords =
+   prog_data.output_vertex_size_hwords =
       ALIGN(output_vertex_size_bytes, 32) / 32;
 
    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
@@ -238,10 +240,10 @@ brw_codegen_gs_prog(struct brw_context *brw,
    unsigned output_size_bytes;
    if (brw->gen >= 7) {
       output_size_bytes =
-         c.prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
-      output_size_bytes += 32 * c.prog_data.control_data_header_size_hwords;
+         prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
+      output_size_bytes += 32 * prog_data.control_data_header_size_hwords;
    } else {
-      output_size_bytes = c.prog_data.output_vertex_size_hwords * 32;
+      output_size_bytes = prog_data.output_vertex_size_hwords * 32;
    }
 
    /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
@@ -262,11 +264,11 @@ brw_codegen_gs_prog(struct brw_context *brw,
     * a multiple of 128 bytes in gen6.
     */
    if (brw->gen >= 7)
-      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+      prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
    else
-      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
+      prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
 
-   c.prog_data.output_topology =
+   prog_data.output_topology =
       get_hw_prim_for_gl_prim(gp->program.OutputType);
 
    /* The GLSL linker will have already matched up GS inputs and the outputs
@@ -289,7 +291,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
     */
-   c.prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
+   prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
 
    if (unlikely(INTEL_DEBUG & DEBUG_GS))
       brw_dump_ir("geometry", prog, gs, NULL);
@@ -303,7 +305,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    char *error_str;
    const unsigned *program =
       brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, &c,
-                     shader->Program->nir, prog,
+                     &prog_data, shader->Program->nir, prog,
                      st_index, &program_size, &error_str);
    if (program == NULL) {
       ralloc_free(mem_ctx);
@@ -311,16 +313,16 @@ brw_codegen_gs_prog(struct brw_context *brw,
    }
 
    /* Scratch space is used for register spilling */
-   if (c.prog_data.base.base.total_scratch) {
+   if (prog_data.base.base.total_scratch) {
       brw_get_scratch_bo(brw, &stage_state->scratch_bo,
-			 c.prog_data.base.base.total_scratch *
+			 prog_data.base.base.total_scratch *
                          brw->max_gs_threads);
    }
 
    brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
                     &c.key, sizeof(c.key),
                     program, program_size,
-                    &c.prog_data, sizeof(c.prog_data),
+                    &prog_data, sizeof(prog_data),
                     &stage_state->prog_offset, &brw->gs.prog_data);
    ralloc_free(mem_ctx);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index 1b929b3df2c..6bc39473137 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -104,7 +104,7 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       break;
 
    case nir_intrinsic_load_primitive_id:
-      assert(c->prog_data.include_primitive_id);
+      assert(gs_prog_data->include_primitive_id);
       dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 0a624857b08..faad1a8d1dc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -35,14 +35,16 @@ namespace brw {
 vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
                                  void *log_data,
                                  struct brw_gs_compile *c,
+                                 struct brw_gs_prog_data *prog_data,
                                  const nir_shader *shader,
                                  void *mem_ctx,
                                  bool no_spills,
                                  int shader_time_index)
    : vec4_visitor(compiler, log_data, &c->key.tex,
-                  &c->prog_data.base, shader,  mem_ctx,
+                  &prog_data->base, shader,  mem_ctx,
                   no_spills, shader_time_index),
-     c(c)
+     c(c),
+     gs_prog_data(prog_data)
 {
 }
 
@@ -80,7 +82,7 @@ vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
     */
    const unsigned num_input_vertices = nir->info.gs.vertices_in;
    assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
-   unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
+   unsigned input_array_stride = prog_data->urb_read_length * 2;
 
    for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
       int varying = c->input_vue_map.slot_to_varying[slot];
@@ -106,7 +108,7 @@ vec4_gs_visitor::setup_payload()
     * to be interleaved, so one register contains two attribute slots.
     */
    int attributes_per_reg =
-      c->prog_data.base.dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
+      prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
 
    /* If a geometry shader tries to read from an input that wasn't written by
     * the vertex shader, that produces undefined results, but it shouldn't
@@ -124,7 +126,7 @@ vec4_gs_visitor::setup_payload()
    reg++;
 
    /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
-   if (c->prog_data.include_primitive_id)
+   if (gs_prog_data->include_primitive_id)
       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
 
    reg = setup_uniforms(reg);
@@ -222,7 +224,7 @@ vec4_gs_visitor::emit_thread_end()
     */
    int base_mrf = 1;
 
-   bool static_vertex_count = c->prog_data.static_vertex_count != -1;
+   bool static_vertex_count = gs_prog_data->static_vertex_count != -1;
 
    /* If the previous instruction was a URB write, we don't need to issue
     * a second one - we can just set the EOT bit on the previous write.
@@ -271,7 +273,7 @@ vec4_gs_visitor::emit_urb_write_header(int mrf)
    vec4_instruction *inst = emit(MOV(mrf_reg, r0));
    inst->force_writemask_all = true;
    emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
-        (uint32_t) c->prog_data.output_vertex_size_hwords);
+        (uint32_t) gs_prog_data->output_vertex_size_hwords);
 }
 
 
@@ -285,12 +287,12 @@ vec4_gs_visitor::emit_urb_write_opcode(bool complete)
    (void) complete;
 
    vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
-   inst->offset = c->prog_data.control_data_header_size_hwords;
+   inst->offset = gs_prog_data->control_data_header_size_hwords;
 
    /* We need to increment Global Offset by 1 to make room for Broadwell's
     * extra "Vertex Count" payload at the beginning of the URB entry.
     */
-   if (devinfo->gen >= 8 && c->prog_data.static_vertex_count == -1)
+   if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
       inst->offset++;
 
    inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
@@ -409,7 +411,7 @@ vec4_gs_visitor::emit_control_data_bits()
     * URB entry.  Since this is an OWord message, Global Offset is counted
     * in 128-bit units, so we must set it to 2.
     */
-   if (devinfo->gen >= 8 && c->prog_data.static_vertex_count == -1)
+   if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
       inst->offset = 2;
    inst->base_mrf = base_mrf;
    inst->mlen = 2;
@@ -536,7 +538,7 @@ vec4_gs_visitor::gs_emit_vertex(int stream_id)
     * do for GL_POINTS outputs that don't use streams).
     */
    if (c->control_data_header_size_bits > 0 &&
-       c->prog_data.control_data_format ==
+       gs_prog_data->control_data_format ==
           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
        this->current_annotation = "emit vertex: Stream control data bits";
        set_stream_control_data_bits(stream_id);
@@ -552,7 +554,7 @@ vec4_gs_visitor::gs_end_primitive()
     * consists of cut bits.  Fortunately, the only time it isn't is when the
     * output type is points, in which case EndPrimitive() is a no-op.
     */
-   if (c->prog_data.control_data_format !=
+   if (gs_prog_data->control_data_format !=
        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
       return;
    }
@@ -600,6 +602,7 @@ extern "C" const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
                void *mem_ctx,
                struct brw_gs_compile *c,
+               struct brw_gs_prog_data *prog_data,
                const nir_shader *shader,
                struct gl_shader_program *shader_prog,
                int shader_time_index,
@@ -611,14 +614,14 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
        * so without spilling. If the GS invocations count > 1, then we can't use
        * dual object mode.
        */
-      if (c->prog_data.invocations <= 1 &&
+      if (prog_data->invocations <= 1 &&
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
-         c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+         prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(compiler, log_data, c, shader,
+         vec4_gs_visitor v(compiler, log_data, c, prog_data, shader,
                            mem_ctx, true /* no_spills */, shader_time_index);
          if (v.run()) {
-            vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx,
+            vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx,
                              INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
             return g.generate_assembly(v.cfg, final_assembly_size, shader);
          }
@@ -648,28 +651,28 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
     * mode is more performant when invocations > 1. Gen6 only supports
     * SINGLE mode.
     */
-   if (c->prog_data.invocations <= 1 || compiler->devinfo->gen < 7)
-      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
+   if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7)
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
-      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
 
    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;
 
    if (compiler->devinfo->gen >= 7)
-      gs = new vec4_gs_visitor(compiler, log_data, c, shader,
-                               mem_ctx, false /* no_spills */,
+      gs = new vec4_gs_visitor(compiler, log_data, c, prog_data,
+                               shader, mem_ctx, false /* no_spills */,
                                shader_time_index);
    else
-      gs = new gen6_gs_visitor(compiler, log_data, c, shader_prog, shader,
-                               mem_ctx, false /* no_spills */,
+      gs = new gen6_gs_visitor(compiler, log_data, c, prog_data, shader_prog,
+                               shader, mem_ctx, false /* no_spills */,
                                shader_time_index);
 
    if (!gs->run()) {
       if (error_str)
          *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
    } else {
-      vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx,
+      vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx,
                        INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
       ret = g.generate_assembly(gs->cfg, final_assembly_size, shader);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index c52552768c8..6ca83a9d9a3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -41,6 +41,7 @@ public:
    vec4_gs_visitor(const struct brw_compiler *compiler,
                    void *log_data,
                    struct brw_gs_compile *c,
+                   struct brw_gs_prog_data *prog_data,
                    const nir_shader *shader,
                    void *mem_ctx,
                    bool no_spills,
@@ -70,6 +71,7 @@ protected:
    src_reg vertex_count;
    src_reg control_data_bits;
    const struct brw_gs_compile * const c;
+   struct brw_gs_prog_data * const gs_prog_data;
 };
 
 } /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index 31cdc96d186..2fef188c17e 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -95,7 +95,7 @@ gen6_gs_visitor::emit_prolog()
    this->prim_count = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->prim_count), 0u));
 
-   if (c->prog_data.gen6_xfb_enabled) {
+   if (gs_prog_data->gen6_xfb_enabled) {
       /* Create a virtual register to hold destination indices in SOL */
       this->destination_indices = src_reg(this, glsl_type::uvec4_type);
       /* Create a virtual register to hold number of written primitives */
@@ -128,7 +128,7 @@ gen6_gs_visitor::emit_prolog()
     * in the 3DSTATE_GS state packet. That information can be obtained by other
     * means though, so we can safely use r1 for this purpose.
     */
-   if (c->prog_data.include_primitive_id) {
+   if (gs_prog_data->include_primitive_id) {
       this->primitive_id =
          src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
       emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
@@ -191,7 +191,7 @@ gen6_gs_visitor::gs_emit_vertex(int stream_id)
        * vertex.
        */
       emit(OR(dst, this->first_vertex,
-              (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
+              (gs_prog_data->output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
       emit(MOV(dst_reg(this->first_vertex), 0u));
    }
    emit(ADD(dst_reg(this->vertex_output_offset),
@@ -353,7 +353,7 @@ gen6_gs_visitor::emit_thread_end()
       this->current_annotation = "gen6 thread end: ff_sync";
 
       vec4_instruction *inst;
-      if (c->prog_data.gen6_xfb_enabled) {
+      if (gs_prog_data->gen6_xfb_enabled) {
          src_reg sol_temp(this, glsl_type::uvec4_type);
          emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
               dst_reg(this->svbi),
@@ -443,7 +443,7 @@ gen6_gs_visitor::emit_thread_end()
       }
       emit(BRW_OPCODE_WHILE);
 
-      if (c->prog_data.gen6_xfb_enabled)
+      if (gs_prog_data->gen6_xfb_enabled)
          xfb_write();
    }
    emit(BRW_OPCODE_ENDIF);
@@ -465,7 +465,7 @@ gen6_gs_visitor::emit_thread_end()
     */
    this->current_annotation = "gen6 thread end: EOT";
 
-   if (c->prog_data.gen6_xfb_enabled) {
+   if (gs_prog_data->gen6_xfb_enabled) {
       /* When emitting EOT, set SONumPrimsWritten Increment Value. */
       src_reg data(this, glsl_type::uint_type);
       emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
@@ -507,7 +507,7 @@ gen6_gs_visitor::setup_payload()
     * information (and move the original value to a virtual register if
     * necessary).
     */
-   if (c->prog_data.include_primitive_id)
+   if (gs_prog_data->include_primitive_id)
       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
    reg++;
 
@@ -530,9 +530,6 @@ gen6_gs_visitor::xfb_setup()
       BRW_SWIZZLE4(3, 3, 3, 3)
    };
 
-   struct brw_gs_prog_data *prog_data =
-      (struct brw_gs_prog_data *) &c->prog_data;
-
    const struct gl_transform_feedback_info *linked_xfb_info =
       &this->shader_prog->LinkedTransformFeedback;
    int i;
@@ -548,11 +545,11 @@ gen6_gs_visitor::xfb_setup()
     */
    assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
 
-   prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
-   for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
-      prog_data->transform_feedback_bindings[i] =
+   gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
+   for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
+      gs_prog_data->transform_feedback_bindings[i] =
          linked_xfb_info->Outputs[i].OutputRegister;
-      prog_data->transform_feedback_swizzles[i] =
+      gs_prog_data->transform_feedback_swizzles[i] =
          swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
    }
 }
@@ -561,13 +558,11 @@ void
 gen6_gs_visitor::xfb_write()
 {
    unsigned num_verts;
-   struct brw_gs_prog_data *prog_data =
-      (struct brw_gs_prog_data *) &c->prog_data;
 
-   if (!prog_data->num_transform_feedback_bindings)
+   if (!gs_prog_data->num_transform_feedback_bindings)
       return;
 
-   switch (c->prog_data.output_topology) {
+   switch (gs_prog_data->output_topology) {
    case _3DPRIM_POINTLIST:
       num_verts = 1;
       break;
@@ -642,10 +637,8 @@ gen6_gs_visitor::xfb_write()
 void
 gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
 {
-   struct brw_gs_prog_data *prog_data =
-      (struct brw_gs_prog_data *) &c->prog_data;
    unsigned binding;
-   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
+   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
    src_reg sol_temp(this, glsl_type::uvec4_type);
 
    /* Check for buffer overflow: we need room to write the complete primitive
@@ -666,7 +659,7 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
        */
       for (binding = 0; binding < num_bindings; ++binding) {
          unsigned char varying =
-            prog_data->transform_feedback_bindings[binding];
+            gs_prog_data->transform_feedback_bindings[binding];
 
          /* Set up the correct destination index for this vertex */
          vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
@@ -704,7 +697,7 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
          else if (varying == VARYING_SLOT_VIEWPORT)
             data.swizzle = BRW_SWIZZLE_ZZZZ;
          else
-            data.swizzle = prog_data->transform_feedback_swizzles[binding];
+            data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
 
          /* Write data */
          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index d02c67d8a74..311cf06833c 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -38,12 +38,13 @@ public:
    gen6_gs_visitor(const struct brw_compiler *comp,
                    void *log_data,
                    struct brw_gs_compile *c,
+                   struct brw_gs_prog_data *prog_data,
                    struct gl_shader_program *prog,
                    const nir_shader *shader,
                    void *mem_ctx,
                    bool no_spills,
                    int shader_time_index) :
-      vec4_gs_visitor(comp, log_data, c, shader, mem_ctx, no_spills,
+      vec4_gs_visitor(comp, log_data, c, prog_data, shader, mem_ctx, no_spills,
                       shader_time_index),
       shader_prog(prog)
       {

From f3bc73073af70a7fe4d2612c80bfb5ae5820b963 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 21 Oct 2015 12:02:08 -0700
Subject: [PATCH 74/85] i965/gs: Use NIR info for setting up prog_data

Previously, we were pulling bits from GL data structures in order to set up
the prog_data.  However, in this brave new world of NIR, we want to be
pulling them out of the NIR shader whenever possible.  This way, we can move
all this setup code into brw_compile_gs without depending on the old GL
stuff.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_gs.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index d7ea2f043f9..f3d1e0b19fb 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -58,6 +58,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
                     struct brw_gs_prog_key *key)
 {
    struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
+   nir_shader *nir = gp->program.Base.nir;
    struct brw_stage_state *stage_state = &brw->gs.base;
    struct brw_gs_prog_data prog_data;
    struct brw_gs_compile c;
@@ -66,9 +67,9 @@ brw_codegen_gs_prog(struct brw_context *brw,
    c.key = *key;
 
    prog_data.include_primitive_id =
-      (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;
+      (nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID) != 0;
 
-   prog_data.invocations = gp->program.Invocations;
+   prog_data.invocations = nir->info.gs.invocations;
 
    assign_gs_binding_table_offsets(brw->intelScreen->devinfo, prog,
                                    &gp->program.Base, &prog_data);
@@ -102,7 +103,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    }
 
    if (brw->gen >= 7) {
-      if (gp->program.OutputType == GL_POINTS) {
+      if (nir->info.gs.output_primitive == GL_POINTS) {
          /* When the output type is points, the geometry shader may output data
           * to multiple streams, and EndPrimitive() has no effect.  So we
           * configure the hardware to interpret the control data as stream ID.
@@ -110,7 +111,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
          prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
 
          /* We only have to emit control bits if we are using streams */
-         if (prog->Geom.UsesStreams)
+         if (nir->info.gs.uses_streams)
             c.control_data_bits_per_vertex = 2;
          else
             c.control_data_bits_per_vertex = 0;
@@ -126,20 +127,21 @@ brw_codegen_gs_prog(struct brw_context *brw,
          /* We only need to output control data if the shader actually calls
           * EndPrimitive().
           */
-         c.control_data_bits_per_vertex = gp->program.UsesEndPrimitive ? 1 : 0;
+         c.control_data_bits_per_vertex =
+            nir->info.gs.uses_end_primitive ? 1 : 0;
       }
    } else {
       /* There are no control data bits in gen6. */
       c.control_data_bits_per_vertex = 0;
 
       /* If it is using transform feedback, enable it */
-      if (prog->TransformFeedback.NumVarying)
+      if (nir->info.has_transform_feedback_varyings)
          prog_data.gen6_xfb_enabled = true;
       else
          prog_data.gen6_xfb_enabled = false;
    }
    c.control_data_header_size_bits =
-      gp->program.VerticesOut * c.control_data_bits_per_vertex;
+      nir->info.gs.vertices_out * c.control_data_bits_per_vertex;
 
    /* 1 HWORD = 32 bytes = 256 bits */
    prog_data.control_data_header_size_hwords =
@@ -240,7 +242,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    unsigned output_size_bytes;
    if (brw->gen >= 7) {
       output_size_bytes =
-         prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
+         prog_data.output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
       output_size_bytes += 32 * prog_data.control_data_header_size_hwords;
    } else {
       output_size_bytes = prog_data.output_vertex_size_hwords * 32;
@@ -269,7 +271,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
       prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
 
    prog_data.output_topology =
-      get_hw_prim_for_gl_prim(gp->program.OutputType);
+      get_hw_prim_for_gl_prim(nir->info.gs.output_primitive);
 
    /* The GLSL linker will have already matched up GS inputs and the outputs
     * of prior stages.  The driver does extend VS outputs in some cases, but
@@ -283,10 +285,10 @@ brw_codegen_gs_prog(struct brw_context *brw,
     * written by previous stages and shows up via payload magic.
     */
    GLbitfield64 inputs_read =
-      gp->program.Base.InputsRead & ~VARYING_BIT_PRIMITIVE_ID;
+      nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
    brw_compute_vue_map(brw->intelScreen->devinfo,
                        &c.input_vue_map, inputs_read,
-                       prog->SeparateShader);
+                       nir->info.separate_shader);
 
    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).

From 0e57694745979286cda0cd414cc6d1f4efe5408b Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 21 Oct 2015 12:03:21 -0700
Subject: [PATCH 75/85] i965/gs: Do prog_data setup and other calculations in
 brw_compile_gs

This commit moves the large pile of setup calculations we have to do for
geometry shaders out of brw_codegen_gs_prog and into brw_compile_gs.  This
has a few nice implications.  First, it's less work that the caller of
brw_compile_gs has to do.  Second, it's consistent with the vertex and
fragment stages.  Finally, it allows us to put brw_gs_compile back behind
the API boundary where it belongs.

v2 (Jason Ekstrand):
 - Pull the changes to use nir info into a separate patch
 - Put brw_gs_compile into brw_shader.h rather than brw_vec4_gs_visitor.h
   so that we can use it for scalar GS.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_compiler.h      |  14 +-
 src/mesa/drivers/dri/i965/brw_gs.c            | 205 +----------------
 src/mesa/drivers/dri/i965/brw_shader.h        |  12 +
 .../drivers/dri/i965/brw_vec4_gs_visitor.cpp  | 211 +++++++++++++++++-
 4 files changed, 222 insertions(+), 220 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 859cfa5c07e..10e1fde73f6 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -593,18 +593,6 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str);
 
-/**
- * Scratch data used when compiling a GLSL geometry shader.
- */
-struct brw_gs_compile
-{
-   struct brw_gs_prog_key key;
-   struct brw_vue_map input_vue_map;
-
-   unsigned control_data_bits_per_vertex;
-   unsigned control_data_header_size_bits;
-};
-
 /**
  * Compile a vertex shader.
  *
@@ -613,7 +601,7 @@ struct brw_gs_compile
 const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
                void *mem_ctx,
-               struct brw_gs_compile *c,
+               const struct brw_gs_prog_key *key,
                struct brw_gs_prog_data *prog_data,
                const struct nir_shader *shader,
                struct gl_shader_program *shader_prog,
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index f3d1e0b19fb..dc59b06bac1 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -58,18 +58,9 @@ brw_codegen_gs_prog(struct brw_context *brw,
                     struct brw_gs_prog_key *key)
 {
    struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
-   nir_shader *nir = gp->program.Base.nir;
    struct brw_stage_state *stage_state = &brw->gs.base;
    struct brw_gs_prog_data prog_data;
-   struct brw_gs_compile c;
    memset(&prog_data, 0, sizeof(prog_data));
-   memset(&c, 0, sizeof(c));
-   c.key = *key;
-
-   prog_data.include_primitive_id =
-      (nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID) != 0;
-
-   prog_data.invocations = nir->info.gs.invocations;
 
    assign_gs_binding_table_offsets(brw->intelScreen->devinfo, prog,
                                    &gp->program.Base, &prog_data);
@@ -97,204 +88,12 @@ brw_codegen_gs_prog(struct brw_context *brw,
    brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base,
                                &prog_data.base.base, false);
 
-   if (brw->gen >= 8) {
-      prog_data.static_vertex_count =
-         nir_gs_count_vertices(gp->program.Base.nir);
-   }
-
-   if (brw->gen >= 7) {
-      if (nir->info.gs.output_primitive == GL_POINTS) {
-         /* When the output type is points, the geometry shader may output data
-          * to multiple streams, and EndPrimitive() has no effect.  So we
-          * configure the hardware to interpret the control data as stream ID.
-          */
-         prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
-
-         /* We only have to emit control bits if we are using streams */
-         if (nir->info.gs.uses_streams)
-            c.control_data_bits_per_vertex = 2;
-         else
-            c.control_data_bits_per_vertex = 0;
-      } else {
-         /* When the output type is triangle_strip or line_strip, EndPrimitive()
-          * may be used to terminate the current strip and start a new one
-          * (similar to primitive restart), and outputting data to multiple
-          * streams is not supported.  So we configure the hardware to interpret
-          * the control data as EndPrimitive information (a.k.a. "cut bits").
-          */
-         prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
-
-         /* We only need to output control data if the shader actually calls
-          * EndPrimitive().
-          */
-         c.control_data_bits_per_vertex =
-            nir->info.gs.uses_end_primitive ? 1 : 0;
-      }
-   } else {
-      /* There are no control data bits in gen6. */
-      c.control_data_bits_per_vertex = 0;
-
-      /* If it is using transform feedback, enable it */
-      if (nir->info.has_transform_feedback_varyings)
-         prog_data.gen6_xfb_enabled = true;
-      else
-         prog_data.gen6_xfb_enabled = false;
-   }
-   c.control_data_header_size_bits =
-      nir->info.gs.vertices_out * c.control_data_bits_per_vertex;
-
-   /* 1 HWORD = 32 bytes = 256 bits */
-   prog_data.control_data_header_size_hwords =
-      ALIGN(c.control_data_header_size_bits, 256) / 256;
-
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
                        &prog_data.base.vue_map, outputs_written,
                        prog ? prog->SeparateShader : false);
 
-   /* Compute the output vertex size.
-    *
-    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
-    * Size (p168):
-    *
-    *     [0,62] indicating [1,63] 16B units
-    *
-    *     Specifies the size of each vertex stored in the GS output entry
-    *     (following any Control Header data) as a number of 128-bit units
-    *     (minus one).
-    *
-    *     Programming Restrictions: The vertex size must be programmed as a
-    *     multiple of 32B units with the following exception: Rendering is
-    *     disabled (as per SOL stage state) and the vertex size output by the
-    *     GS thread is 16B.
-    *
-    *     If rendering is enabled (as per SOL state) the vertex size must be
-    *     programmed as a multiple of 32B units. In other words, the only time
-    *     software can program a vertex size with an odd number of 16B units
-    *     is when rendering is disabled.
-    *
-    * Note: B=bytes in the above text.
-    *
-    * It doesn't seem worth the extra trouble to optimize the case where the
-    * vertex size is 16B (especially since this would require special-casing
-    * the GEN assembly that writes to the URB).  So we just set the vertex
-    * size to a multiple of 32B (2 vec4's) in all cases.
-    *
-    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
-    * budget that as follows:
-    *
-    *   512 bytes for varyings (a varying component is 4 bytes and
-    *             gl_MaxGeometryOutputComponents = 128)
-    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
-    *             bytes)
-    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
-    *             even if it's not used)
-    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
-    *             whenever clip planes are enabled, even if the shader doesn't
-    *             write to gl_ClipDistance)
-    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
-    *             (see below)--this causes up to 1 VUE slot to be wasted
-    *   400 bytes available for varying packing overhead
-    *
-    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
-    * per interpolation type, so this is plenty.
-    *
-    */
-   unsigned output_vertex_size_bytes = prog_data.base.vue_map.num_slots * 16;
-   assert(brw->gen == 6 ||
-          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
-   prog_data.output_vertex_size_hwords =
-      ALIGN(output_vertex_size_bytes, 32) / 32;
-
-   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
-    * That divides up as follows:
-    *
-    *     64 bytes for the control data header (cut indices or StreamID bits)
-    *   4096 bytes for varyings (a varying component is 4 bytes and
-    *              gl_MaxGeometryTotalOutputComponents = 1024)
-    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
-    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
-    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
-    *              even if it's not used)
-    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
-    *              whenever clip planes are enabled, even if the shader doesn't
-    *              write to gl_ClipDistance)
-    *   4096 bytes overhead since the VUE size must be a multiple of 32
-    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
-    *   8128 bytes available for varying packing overhead
-    *
-    * Worst-case varying packing overhead is 3/4 of a varying slot per
-    * interpolation type, which works out to 3072 bytes, so this would allow
-    * us to accommodate 2 interpolation types without any danger of running
-    * out of URB space.
-    *
-    * In practice, the risk of running out of URB space is very small, since
-    * the above figures are all worst-case, and most of them scale with the
-    * number of output vertices.  So we'll just calculate the amount of space
-    * we need, and if it's too large, fail to compile.
-    *
-    * The above is for gen7+ where we have a single URB entry that will hold
-    * all the output. In gen6, we will have to allocate URB entries for every
-    * vertex we emit, so our URB entries only need to be large enough to hold
-    * a single vertex. Also, gen6 does not have a control data header.
-    */
-   unsigned output_size_bytes;
-   if (brw->gen >= 7) {
-      output_size_bytes =
-         prog_data.output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
-      output_size_bytes += 32 * prog_data.control_data_header_size_hwords;
-   } else {
-      output_size_bytes = prog_data.output_vertex_size_hwords * 32;
-   }
-
-   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
-    * which comes before the control header.
-    */
-   if (brw->gen >= 8)
-      output_size_bytes += 32;
-
-   assert(output_size_bytes >= 1);
-   int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
-   if (brw->gen == 6)
-      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
-   if (output_size_bytes > max_output_size_bytes)
-      return false;
-
-
-   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
-    * a multiple of 128 bytes in gen6.
-    */
-   if (brw->gen >= 7)
-      prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
-   else
-      prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
-
-   prog_data.output_topology =
-      get_hw_prim_for_gl_prim(nir->info.gs.output_primitive);
-
-   /* The GLSL linker will have already matched up GS inputs and the outputs
-    * of prior stages.  The driver does extend VS outputs in some cases, but
-    * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
-    * geometry shader support.  So we can safely ignore that.
-    *
-    * For SSO pipelines, we use a fixed VUE map layout based on variable
-    * locations, so we can rely on rendezvous-by-location making this work.
-    *
-    * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
-    * written by previous stages and shows up via payload magic.
-    */
-   GLbitfield64 inputs_read =
-      nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
-   brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.input_vue_map, inputs_read,
-                       nir->info.separate_shader);
-
-   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
-    * need to program a URB read length of ceiling(num_slots / 2).
-    */
-   prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
-
    if (unlikely(INTEL_DEBUG & DEBUG_GS))
       brw_dump_ir("geometry", prog, gs, NULL);
 
@@ -306,7 +105,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    unsigned program_size;
    char *error_str;
    const unsigned *program =
-      brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, &c,
+      brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, key,
                      &prog_data, shader->Program->nir, prog,
                      st_index, &program_size, &error_str);
    if (program == NULL) {
@@ -322,7 +121,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    }
 
    brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
-                    &c.key, sizeof(c.key),
+                    key, sizeof(*key),
                     program, program_size,
                     &prog_data, sizeof(prog_data),
                     &stage_state->prog_offset, &brw->gs.prog_data);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index b33b08f40d7..51b059fcaa1 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -233,6 +233,18 @@ bool opt_predicated_break(struct backend_shader *s);
 extern "C" {
 #endif
 
+/**
+ * Scratch data used when compiling a GLSL geometry shader.
+ */
+struct brw_gs_compile
+{
+   struct brw_gs_prog_key key;
+   struct brw_vue_map input_vue_map;
+
+   unsigned control_data_bits_per_vertex;
+   unsigned control_data_header_size_bits;
+};
+
 struct brw_compiler *
 brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index faad1a8d1dc..9402489e628 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -601,7 +601,7 @@ vec4_gs_visitor::gs_end_primitive()
 extern "C" const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
                void *mem_ctx,
-               struct brw_gs_compile *c,
+               const struct brw_gs_prog_key *key,
                struct brw_gs_prog_data *prog_data,
                const nir_shader *shader,
                struct gl_shader_program *shader_prog,
@@ -609,6 +609,209 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str)
 {
+   struct brw_gs_compile c;
+   memset(&c, 0, sizeof(c));
+   c.key = *key;
+
+   prog_data->include_primitive_id =
+      (shader->info.inputs_read & VARYING_BIT_PRIMITIVE_ID) != 0;
+
+   prog_data->invocations = shader->info.gs.invocations;
+
+   if (compiler->devinfo->gen >= 8)
+      prog_data->static_vertex_count = nir_gs_count_vertices(shader);
+
+   if (compiler->devinfo->gen >= 7) {
+      if (shader->info.gs.output_primitive == GL_POINTS) {
+         /* When the output type is points, the geometry shader may output data
+          * to multiple streams, and EndPrimitive() has no effect.  So we
+          * configure the hardware to interpret the control data as stream ID.
+          */
+         prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+
+         /* We only have to emit control bits if we are using streams */
+         if (shader_prog && shader_prog->Geom.UsesStreams)
+            c.control_data_bits_per_vertex = 2;
+         else
+            c.control_data_bits_per_vertex = 0;
+      } else {
+         /* When the output type is triangle_strip or line_strip, EndPrimitive()
+          * may be used to terminate the current strip and start a new one
+          * (similar to primitive restart), and outputting data to multiple
+          * streams is not supported.  So we configure the hardware to interpret
+          * the control data as EndPrimitive information (a.k.a. "cut bits").
+          */
+         prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
+
+         /* We only need to output control data if the shader actually calls
+          * EndPrimitive().
+          */
+         c.control_data_bits_per_vertex =
+            shader->info.gs.uses_end_primitive ? 1 : 0;
+      }
+   } else {
+      /* There are no control data bits in gen6. */
+      c.control_data_bits_per_vertex = 0;
+
+      /* If it is using transform feedback, enable it */
+      if (shader->info.has_transform_feedback_varyings)
+         prog_data->gen6_xfb_enabled = true;
+      else
+         prog_data->gen6_xfb_enabled = false;
+   }
+   c.control_data_header_size_bits =
+      shader->info.gs.vertices_out * c.control_data_bits_per_vertex;
+
+   /* 1 HWORD = 32 bytes = 256 bits */
+   prog_data->control_data_header_size_hwords =
+      ALIGN(c.control_data_header_size_bits, 256) / 256;
+
+   /* Compute the output vertex size.
+    *
+    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
+    * Size (p168):
+    *
+    *     [0,62] indicating [1,63] 16B units
+    *
+    *     Specifies the size of each vertex stored in the GS output entry
+    *     (following any Control Header data) as a number of 128-bit units
+    *     (minus one).
+    *
+    *     Programming Restrictions: The vertex size must be programmed as a
+    *     multiple of 32B units with the following exception: Rendering is
+    *     disabled (as per SOL stage state) and the vertex size output by the
+    *     GS thread is 16B.
+    *
+    *     If rendering is enabled (as per SOL state) the vertex size must be
+    *     programmed as a multiple of 32B units. In other words, the only time
+    *     software can program a vertex size with an odd number of 16B units
+    *     is when rendering is disabled.
+    *
+    * Note: B=bytes in the above text.
+    *
+    * It doesn't seem worth the extra trouble to optimize the case where the
+    * vertex size is 16B (especially since this would require special-casing
+    * the GEN assembly that writes to the URB).  So we just set the vertex
+    * size to a multiple of 32B (2 vec4's) in all cases.
+    *
+    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
+    * budget that as follows:
+    *
+    *   512 bytes for varyings (a varying component is 4 bytes and
+    *             gl_MaxGeometryOutputComponents = 128)
+    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *             bytes)
+    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *             even if it's not used)
+    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *             whenever clip planes are enabled, even if the shader doesn't
+    *             write to gl_ClipDistance)
+    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
+    *             (see below)--this causes up to 1 VUE slot to be wasted
+    *   400 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
+    * per interpolation type, so this is plenty.
+    *
+    */
+   unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
+   assert(compiler->devinfo->gen == 6 ||
+          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
+   prog_data->output_vertex_size_hwords =
+      ALIGN(output_vertex_size_bytes, 32) / 32;
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     64 bytes for the control data header (cut indices or StreamID bits)
+    *   4096 bytes for varyings (a varying component is 4 bytes and
+    *              gl_MaxGeometryTotalOutputComponents = 1024)
+    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
+    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *              even if it's not used)
+    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *              whenever clip planes are enabled, even if the shader doesn't
+    *              write to gl_ClipDistance)
+    *   4096 bytes overhead since the VUE size must be a multiple of 32
+    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
+    *   8128 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot per
+    * interpolation type, which works out to 3072 bytes, so this would allow
+    * us to accommodate 2 interpolation types without any danger of running
+    * out of URB space.
+    *
+    * In practice, the risk of running out of URB space is very small, since
+    * the above figures are all worst-case, and most of them scale with the
+    * number of output vertices.  So we'll just calculate the amount of space
+    * we need, and if it's too large, fail to compile.
+    *
+    * The above is for gen7+ where we have a single URB entry that will hold
+    * all the output. In gen6, we will have to allocate URB entries for every
+    * vertex we emit, so our URB entries only need to be large enough to hold
+    * a single vertex. Also, gen6 does not have a control data header.
+    */
+   unsigned output_size_bytes;
+   if (compiler->devinfo->gen >= 7) {
+      output_size_bytes =
+         prog_data->output_vertex_size_hwords * 32 * shader->info.gs.vertices_out;
+      output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
+   } else {
+      output_size_bytes = prog_data->output_vertex_size_hwords * 32;
+   }
+
+   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
+    * which comes before the control header.
+    */
+   if (compiler->devinfo->gen >= 8)
+      output_size_bytes += 32;
+
+   assert(output_size_bytes >= 1);
+   int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   if (compiler->devinfo->gen == 6)
+      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   if (output_size_bytes > max_output_size_bytes)
+      return false;
+
+
+   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
+    * a multiple of 128 bytes in gen6.
+    */
+   if (compiler->devinfo->gen >= 7)
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+   else
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
+
+   prog_data->output_topology =
+      get_hw_prim_for_gl_prim(shader->info.gs.output_primitive);
+
+   /* The GLSL linker will have already matched up GS inputs and the outputs
+    * of prior stages.  The driver does extend VS outputs in some cases, but
+    * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
+    * geometry shader support.  So we can safely ignore that.
+    *
+    * For SSO pipelines, we use a fixed VUE map layout based on variable
+    * locations, so we can rely on rendezvous-by-location making this work.
+    *
+    * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
+    * written by previous stages and shows up via payload magic.
+    */
+   GLbitfield64 inputs_read =
+      shader->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
+   brw_compute_vue_map(compiler->devinfo,
+                       &c.input_vue_map, inputs_read,
+                       shader->info.separate_shader);
+
+   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
+    * need to program a URB read length of ceiling(num_slots / 2).
+    */
+   prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
+
+   /* Now that prog_data setup is done, we are ready to actually compile the
+    * program.
+    */
+
    if (compiler->devinfo->gen >= 7) {
       /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
        * so without spilling. If the GS invocations count > 1, then we can't use
@@ -618,7 +821,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
          prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(compiler, log_data, c, prog_data, shader,
+         vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
                            mem_ctx, true /* no_spills */, shader_time_index);
          if (v.run()) {
             vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx,
@@ -660,11 +863,11 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
    const unsigned *ret = NULL;
 
    if (compiler->devinfo->gen >= 7)
-      gs = new vec4_gs_visitor(compiler, log_data, c, prog_data,
+      gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data,
                                shader, mem_ctx, false /* no_spills */,
                                shader_time_index);
    else
-      gs = new gen6_gs_visitor(compiler, log_data, c, prog_data, shader_prog,
+      gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, shader_prog,
                                shader, mem_ctx, false /* no_spills */,
                                shader_time_index);
 

From bea75227829512ab0e4766e00ac1b509c7586667 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 6 May 2015 00:04:10 -0700
Subject: [PATCH 76/85] i965: Introduce new
 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED/PER_SLOT opcodes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the vec4 backend, we have a vec4_instruction::urb_write_flags field.
There are many kinds of flags for SIMD4x2 messages.

However, there are really only two (per-slot offset, use channel masks)
for SIMD8 messages.  Rather than adding a boolean flag for per-slot
offsets (polluting all instructions), I decided to just make three new
opcodes.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_defines.h        |  3 +++
 src/mesa/drivers/dri/i965/brw_fs.cpp           |  9 +++++++++
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 11 +++++++++++
 src/mesa/drivers/dri/i965/brw_inst.h           |  1 +
 src/mesa/drivers/dri/i965/brw_shader.cpp       |  9 +++++++++
 5 files changed, 33 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index e61ad545744..b2ce197faba 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1032,6 +1032,9 @@ enum opcode {
    SHADER_OPCODE_GEN7_SCRATCH_READ,
 
    SHADER_OPCODE_URB_WRITE_SIMD8,
+   SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT,
+   SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+   SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT,
 
    /**
     * Return the index of an arbitrary live channel (i.e. one of the channels
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index da90467e625..6f344c3bfdc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -281,6 +281,9 @@ fs_inst::is_send_from_grf() const
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
       return true;
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
       return src[1].file == GRF;
@@ -781,6 +784,9 @@ fs_inst::regs_read(int arg) const
    switch (opcode) {
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -910,6 +916,9 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 13c495cd395..f4b6afa3732 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -368,6 +368,14 @@ fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
    brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
    brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
 
+   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
+      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
+
+   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
+      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
+
    brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
    brw_inst_set_rlen(p->devinfo, insn, 0);
    brw_inst_set_eot(p->devinfo, insn, inst->eot);
@@ -2002,6 +2010,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 	 break;
 
       case SHADER_OPCODE_URB_WRITE_SIMD8:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
 	 generate_urb_write(inst, src[0]);
 	 break;
 
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index db05a8a5f30..4ed95c473cd 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -393,6 +393,7 @@ FF(urb_per_slot_offset,
    /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1,
    /* 7:   */ MD(16), MD(16),
    /* 8:   */ MD(17), MD(17))
+FC(urb_channel_mask_present, MD(15), MD(15), devinfo->gen >= 8)
 FC(urb_complete, MD(15), MD(15), devinfo->gen < 8)
 FC(urb_used, MD(14), MD(14), devinfo->gen < 7)
 FC(urb_allocate, MD(13), MD(13), devinfo->gen < 7)
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 2324b56f583..94c201142e7 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -408,6 +408,12 @@ brw_instruction_name(enum opcode op)
       return "gen7_scratch_read";
    case SHADER_OPCODE_URB_WRITE_SIMD8:
       return "gen8_urb_write_simd8";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+      return "gen8_urb_write_simd8_per_slot";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+      return "gen8_urb_write_simd8_masked";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+      return "gen8_urb_write_simd8_masked_per_slot";
 
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
       return "find_live_channel";
@@ -961,6 +967,9 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
    case SHADER_OPCODE_MEMORY_FENCE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_BARRIER:
       return true;

From ac98888afdc121e6eaafc9c5393647a2df4baef6 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 29 Sep 2015 14:32:02 -0700
Subject: [PATCH 77/85] i965: Introduce a new SHADER_OPCODE_URB_READ_SIMD8
 opcode.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In scalar mode, geometry shader inputs can easily take up hundreds of
registers.  This makes pushing VUE entries impractical; we'll need to
resort to the pull model in some cases.

To support this, we introduce a new opcode corresponding to the "URB
Read SIMD8" message.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_defines.h       |  9 +++++++
 src/mesa/drivers/dri/i965/brw_fs.cpp          |  2 ++
 src/mesa/drivers/dri/i965/brw_fs.h            |  1 +
 .../drivers/dri/i965/brw_fs_generator.cpp     | 26 +++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp      |  2 ++
 5 files changed, 40 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index b2ce197faba..bd7d0b1c9a7 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1031,6 +1031,15 @@ enum opcode {
    SHADER_OPCODE_GEN4_SCRATCH_WRITE,
    SHADER_OPCODE_GEN7_SCRATCH_READ,
 
+   /**
+    * Gen8+ SIMD8 URB Read message.
+    *
+    * Source 0: The header register, containing URB handles (g1).
+    *
+    * Currently only supports constant offsets, in inst->offset.
+    */
+   SHADER_OPCODE_URB_READ_SIMD8,
+
    SHADER_OPCODE_URB_WRITE_SIMD8,
    SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT,
    SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 6f344c3bfdc..436ee4d5f23 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -284,6 +284,7 @@ fs_inst::is_send_from_grf() const
    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case SHADER_OPCODE_URB_READ_SIMD8:
       return true;
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
       return src[1].file == GRF;
@@ -787,6 +788,7 @@ fs_inst::regs_read(int arg) const
    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case SHADER_OPCODE_URB_READ_SIMD8:
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 171338dcc0b..f4d2e14b821 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -415,6 +415,7 @@ private:
                       struct brw_reg implied_header,
                       GLuint nr);
    void generate_fb_write(fs_inst *inst, struct brw_reg payload);
+   void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg header);
    void generate_urb_write(fs_inst *inst, struct brw_reg payload);
    void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
    void generate_barrier(fs_inst *inst, struct brw_reg src);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index f4b6afa3732..bb7e792044f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -354,6 +354,28 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
    }
 }
 
+void
+fs_generator::generate_urb_read(fs_inst *inst,
+                                struct brw_reg dst,
+                                struct brw_reg header)
+{
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+   /* Emit a SEND that returns its response in dst, using header as src0. */
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+   brw_set_src1(p, send, brw_imm_ud(0u));
+   /* Address the URB shared function with the Gen8 SIMD8 read opcode. */
+   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
+   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
+   /* Lengths and the constant global offset are taken from the IR inst. */
+   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
+   brw_inst_set_rlen(p->devinfo, send, inst->regs_written);
+   brw_inst_set_header_present(p->devinfo, send, true);
+   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
+}
+
 void
 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
 {
@@ -2009,6 +2031,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          fill_count++;
 	 break;
 
+      case SHADER_OPCODE_URB_READ_SIMD8:
+         generate_urb_read(inst, dst, src[0]);
+         break;
+
       case SHADER_OPCODE_URB_WRITE_SIMD8:
       case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 94c201142e7..d910e479c9d 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -414,6 +414,8 @@ brw_instruction_name(enum opcode op)
       return "gen8_urb_write_simd8_masked";
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
       return "gen8_urb_write_simd8_masked_per_slot";
+   case SHADER_OPCODE_URB_READ_SIMD8:
+      return "urb_read_simd8";
 
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
       return "find_live_channel";

From 72d84ae7ceaed4e723376a18d09689be183e0155 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 29 Sep 2015 14:43:29 -0700
Subject: [PATCH 78/85] i965: Introduce a
 brw_vue_prog_data::include_vue_handles flag.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tessellation shaders and SIMD8 geometry shaders may need to resort to
the pull model for inputs at times.  When set, the state upload code
will tell the hardware to provide URB handles for input data.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_compiler.h  | 3 +++
 src/mesa/drivers/dri/i965/gen8_gs_state.c | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 10e1fde73f6..6a9799e578e 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -485,6 +485,9 @@ struct brw_vue_prog_data {
    struct brw_stage_prog_data base;
    struct brw_vue_map vue_map;
 
+   /** Should the hardware deliver input VUE handles for URB pull loads? */
+   bool include_vue_handles;
+
    GLuint urb_read_length;
    GLuint total_grf;
 
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index d766ca7bebf..6738e85eaba 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -68,6 +68,8 @@ gen8_upload_gs_state(struct brw_context *brw)
                  GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) |
                 (brw->gs.prog_data->output_topology <<
                  GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) |
+                (prog_data->include_vue_handles ?
+                 GEN7_GS_INCLUDE_VERTEX_HANDLES : 0) |
                 (prog_data->urb_read_length <<
                  GEN6_GS_URB_READ_LENGTH_SHIFT) |
                 (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |

From 6ae419b94d7a7d2dfbb9a2908d28ca5aea4724e3 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 12 Mar 2015 05:52:13 -0700
Subject: [PATCH 79/85] i965: Make fs_visitor::emit_urb_writes reusable for
 scalar GS.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GS doesn't have ClampVertexColor, and we don't want to go through VS
structures.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index f825fed4daf..d7c4a6e213d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -868,13 +868,13 @@ void
 fs_visitor::emit_urb_writes()
 {
    int slot, urb_offset, length;
-   struct brw_vs_prog_data *vs_prog_data =
-      (struct brw_vs_prog_data *) prog_data;
-   const struct brw_vs_prog_key *key =
+   const struct brw_vue_prog_data *vue_prog_data =
+      (const struct brw_vue_prog_data *) this->prog_data;
+   const struct brw_vs_prog_key *vs_key =
       (const struct brw_vs_prog_key *) this->key;
    const GLbitfield64 psiz_mask =
       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
-   const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
+   const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
    bool flush;
    fs_reg sources[8];
 
@@ -961,11 +961,11 @@ fs_visitor::emit_urb_writes()
             break;
          }
 
-         if ((varying == VARYING_SLOT_COL0 ||
+         if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
+             (varying == VARYING_SLOT_COL0 ||
               varying == VARYING_SLOT_COL1 ||
               varying == VARYING_SLOT_BFC0 ||
-              varying == VARYING_SLOT_BFC1) &&
-             key->clamp_vertex_color) {
+              varying == VARYING_SLOT_BFC1)) {
             /* We need to clamp these guys, so do a saturating MOV into a
              * temp register and use that for the payload.
              */

From cb755996d91e9f44c93121f9534b0c59bb3ec201 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 12 Mar 2015 01:55:44 -0700
Subject: [PATCH 80/85] i965: Make emit_urb_writes() only set EOT for the VS.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GS will emit a bunch of vertices, and we don't want to do an EOT
prematurely.  We'll emit GS_OPCODE_THREAD_END when we want to terminate
the thread.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d7c4a6e213d..4610ea19c67 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1005,7 +1005,7 @@ fs_visitor::emit_urb_writes()
 
          fs_inst *inst =
             abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
-         inst->eot = last;
+         inst->eot = last && stage == MESA_SHADER_VERTEX;
          inst->mlen = length + 1;
          inst->offset = urb_offset;
          urb_offset = slot + 1;

From ac0a33666bdab6e4d9abca6ae6ee19cb03919dcc Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 5 May 2015 20:19:04 -0700
Subject: [PATCH 81/85] i965: Make emit_urb_writes() reserve space for GS
 header information.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Geometry shaders have additional header data at the beginning of their
output URB entries.  Shaders that use EndPrimitive() or multiple streams
have a control data header; shaders with a dynamic vertex count have an
additional vec4 slot to hold the 32-bit vertex count (and 96 bits of
padding).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 4610ea19c67..1b6a199fffa 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -868,6 +868,7 @@ void
 fs_visitor::emit_urb_writes()
 {
    int slot, urb_offset, length;
+   int starting_urb_offset = 0;
    const struct brw_vue_prog_data *vue_prog_data =
       (const struct brw_vue_prog_data *) this->prog_data;
    const struct brw_vs_prog_key *vs_key =
@@ -900,8 +901,21 @@ fs_visitor::emit_urb_writes()
       return;
    }
 
+   if (stage == MESA_SHADER_GEOMETRY) {
+      const struct brw_gs_prog_data *gs_prog_data =
+         (const struct brw_gs_prog_data *) prog_data;
+
+      /* We need to increment the Global Offset to skip over the control data
+       * header and the extra "Vertex Count" field (1 HWord) at the beginning
+       * of the VUE.  We're counting in OWords, so the units are doubled.
+       */
+      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
+      if (gs_prog_data->static_vertex_count == -1)
+         starting_urb_offset += 2;
+   }
+
    length = 0;
-   urb_offset = 0;
+   urb_offset = starting_urb_offset;
    flush = false;
    for (slot = 0; slot < vue_map->num_slots; slot++) {
       int varying = vue_map->slot_to_varying[slot];
@@ -1008,7 +1022,7 @@ fs_visitor::emit_urb_writes()
          inst->eot = last && stage == MESA_SHADER_VERTEX;
          inst->mlen = length + 1;
          inst->offset = urb_offset;
-         urb_offset = slot + 1;
+         urb_offset = starting_urb_offset + slot + 1;
          length = 0;
          flush = false;
       }

From 55dfd39b5f18f820694cad74ce40a3e0d3d6a0c4 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 26 Jun 2015 16:20:21 -0700
Subject: [PATCH 82/85] i965: Add a brw->scalar_gs flag controlled by
 INTEL_SCALAR_GS=1.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a brw->scalar_gs flag, similar to brw->scalar_vs,
which controls whether or not to use SIMD8 geometry shaders.

For now, we control it via a new environment variable, INTEL_SCALAR_GS.
This provides a convenient way to try it out.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_compiler.h | 1 +
 src/mesa/drivers/dri/i965/brw_gs.c       | 3 ++-
 src/mesa/drivers/dri/i965/brw_shader.cpp | 5 +++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 6a9799e578e..742bac4e8ef 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -90,6 +90,7 @@ struct brw_compiler {
    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
 
    bool scalar_vs;
+   bool scalar_gs;
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index dc59b06bac1..ed0890f430f 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -57,6 +57,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
                     struct brw_geometry_program *gp,
                     struct brw_gs_prog_key *key)
 {
+   struct brw_compiler *compiler = brw->intelScreen->compiler;
    struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    struct brw_stage_state *stage_state = &brw->gs.base;
    struct brw_gs_prog_data prog_data;
@@ -86,7 +87,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
    prog_data.base.base.nr_image_params = gs->NumImages;
 
    brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base,
-                               &prog_data.base.base, false);
+                               &prog_data.base.base, compiler->scalar_gs);
 
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index d910e479c9d..0ac4f2f6e0d 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -79,6 +79,8 @@ is_scalar_shader_stage(const struct brw_compiler *compiler, int stage)
    case MESA_SHADER_FRAGMENT:
    case MESA_SHADER_COMPUTE:
       return true;
+   case MESA_SHADER_GEOMETRY:
+      return compiler->scalar_gs;
    case MESA_SHADER_VERTEX:
       return compiler->scalar_vs;
    default:
@@ -101,6 +103,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    if (devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
       compiler->scalar_vs = true;
 
+   if (devinfo->gen >= 8 && brw_env_var_as_boolean("INTEL_SCALAR_GS", false))
+      compiler->scalar_gs = true;
+
    nir_shader_compiler_options *nir_options =
       rzalloc(compiler, nir_shader_compiler_options);
    nir_options->native_integers = true;

From b3ebf03b8406f9f1cd215b98ebdd3fc751f73559 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 29 Jun 2015 22:50:28 -0700
Subject: [PATCH 83/85] i965: Add a fs_visitor constructor that takes a
 brw_gs_compile.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unlike the vs/wm structs, brw_gs_compile is actually useful: it contains
the input VUE map and information about the control data headers.
Passing this in allows us to share that code in brw_gs.c, and calculate
them before deciding on vec4 vs. scalar mode, as it's independent of
that choice.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_fs.h           | 11 ++++++-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 31 ++++++++++++++++++--
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f4d2e14b821..50e98becf03 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -62,6 +62,8 @@ namespace brw {
    class fs_live_variables;
 }
 
+struct brw_gs_compile;
+
 static inline fs_reg
 offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta)
 {
@@ -99,7 +101,12 @@ public:
               const nir_shader *shader,
               unsigned dispatch_width,
               int shader_time_index);
-
+   fs_visitor(const struct brw_compiler *compiler, void *log_data,
+              void *mem_ctx,
+              struct brw_gs_compile *gs_compile,
+              struct brw_gs_prog_data *prog_data,
+              const nir_shader *shader);
+   void init();
    ~fs_visitor();
 
    fs_reg vgrf(const glsl_type *const type);
@@ -298,6 +305,8 @@ public:
    const void *const key;
    const struct brw_sampler_prog_key_data *key_tex;
 
+   struct brw_gs_compile *gs_compile;
+
    struct brw_stage_prog_data *prog_data;
    struct gl_program *prog;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 1b6a199fffa..7cc4f3c927a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -41,6 +41,7 @@
 #include "brw_wm.h"
 #include "brw_cs.h"
 #include "brw_vec4.h"
+#include "brw_vec4_gs_visitor.h"
 #include "brw_fs.h"
 #include "main/uniforms.h"
 #include "glsl/nir/glsl_types.h"
@@ -1085,11 +1086,33 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        unsigned dispatch_width,
                        int shader_time_index)
    : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
-     key(key), prog_data(prog_data), prog(prog),
+     key(key), gs_compile(NULL), prog_data(prog_data), prog(prog),
      dispatch_width(dispatch_width),
      shader_time_index(shader_time_index),
-     promoted_constants(0),
      bld(fs_builder(this, dispatch_width).at_end())
+{
+   init();
+}
+
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
+                       void *mem_ctx,
+                       struct brw_gs_compile *c,
+                       struct brw_gs_prog_data *prog_data,
+                       const nir_shader *shader)
+   : backend_shader(compiler, log_data, mem_ctx, shader,
+                    &prog_data->base.base),
+     key(&c->key), gs_compile(c), /* key comes from the GS compile struct */
+     prog_data(&prog_data->base.base), prog(NULL), /* no gl_program here */
+     dispatch_width(8), /* this constructor hard-codes SIMD8 dispatch */
+     shader_time_index(ST_GS),
+     bld(fs_builder(this, dispatch_width).at_end())
+{
+   init(); /* member setup shared with the other constructor */
+}
+
+
+void
+fs_visitor::init()
 {
    switch (stage) {
    case MESA_SHADER_FRAGMENT:
@@ -1108,6 +1131,8 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
       unreachable("unhandled shader stage");
    }
 
+   this->prog_data = this->stage_prog_data;
+
    this->failed = false;
    this->simd16_unsupported = false;
    this->no16_msg = NULL;
@@ -1133,6 +1158,8 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
    this->pull_constant_loc = NULL;
    this->push_constant_loc = NULL;
 
+   this->promoted_constants = 0;
+
    this->spilled_any_registers = false;
    this->do_dual_src = false;
 

From c5ae34f38f239d346090212a9f33a947a3b7642e Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 23 Sep 2015 18:59:57 -0700
Subject: [PATCH 84/85] i965: Implement nir_intrinsic_load_primitive_id.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 792663f2644..123e86eca37 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -30,6 +30,7 @@
 #include "brw_fs_surface_builder.h"
 #include "brw_nir.h"
 #include "brw_fs_surface_builder.h"
+#include "brw_vec4_gs_visitor.h"
 
 using namespace brw;
 using namespace brw::surface_access;
@@ -1367,6 +1368,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_load_vertex_id:
       unreachable("should be lowered by lower_vertex_id()");
 
+   case nir_intrinsic_load_primitive_id:
+      assert(stage == MESA_SHADER_GEOMETRY);
+      assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
+      break;
+
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
    case nir_intrinsic_load_instance_id:

From 48c76eae8e52fba2fe22d2cfa7f3c94a5420feb2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 10 Jul 2015 00:16:19 -0700
Subject: [PATCH 85/85] i965: Implement gl_InvocationID.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's stored in bits 31:27 of g1 (along with the URB handles).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 123e86eca37..e1fb12060c8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -189,6 +189,18 @@ emit_system_values_block(nir_block *block, void *void_visitor)
             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
          break;
 
+      case nir_intrinsic_load_invocation_id:
+         assert(v->stage == MESA_SHADER_GEOMETRY);
+         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+         if (reg->file == BAD_FILE) {
+            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
+            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            abld.SHR(iid, g1, fs_reg(27u));
+            *reg = iid;
+         }
+         break;
+
       case nir_intrinsic_load_sample_pos:
          assert(v->stage == MESA_SHADER_FRAGMENT);
          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
@@ -1378,6 +1390,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
    case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_invocation_id:
    case nir_intrinsic_load_sample_mask_in:
    case nir_intrinsic_load_sample_id: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);